diff --git a/build.xml b/build.xml index 275cb5555..1f26e7b7a 100644 --- a/build.xml +++ b/build.xml @@ -163,6 +163,14 @@ + + + + @@ -709,53 +717,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -769,20 +730,113 @@ - - - + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -792,10 +846,10 @@ - + - - + - - + + + + - - + + + + + + + + + + + + + + - - + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - + + + - + - + + + - + - + + + - + - + + + - + + + + + + - - + + - - + + - - - - - - + + diff --git a/ivy.xml b/ivy.xml index 115f4062a..f90b9a010 100644 --- a/ivy.xml +++ b/ivy.xml @@ -15,10 +15,8 @@ - - - - + + diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R index a24d269c9..de6aa045e 100644 --- a/public/R/queueJobReport.R +++ b/public/R/queueJobReport.R @@ -12,14 +12,14 @@ if ( onCMDLine ) { inputFileName = args[1] outputPDF = args[2] } else { - #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt" - inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" + inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt" + #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt" outputPDF = NA } -RUNTIME_UNITS = "(sec)" -ORIGINAL_UNITS_TO_SECONDS = 1/1000 +RUNTIME_UNITS = "(hours)" +ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60 # # Helper function to aggregate all of the jobs in the report across all tables @@ -33,7 +33,7 @@ allJobsFromReport <- function(report) { # # Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job # -plotJobsGantt <- function(gatkReport, sortOverall) { +plotJobsGantt <- function(gatkReport, sortOverall, includeText) { allJobs = allJobsFromReport(gatkReport) if ( sortOverall ) { title = "All jobs, by analysis, by start time" @@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) { } allJobs$index = 1:nrow(allJobs) minTime = min(allJobs$startTime) - allJobs$relStartTime = allJobs$startTime - minTime - allJobs$relDoneTime = allJobs$doneTime - minTime + allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS + allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts) maxRelTime = max(allJobs$relDoneTime) p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName)) - p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm"))) - p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) + p <- p + theme_bw() + p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm"))) + if ( includeText ) + p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) p <- p + xlim(0, maxRelTime * 1.1) p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS)) - p <- p + ylab("Job") + p <- p + ylab("Job number") p <- p + opts(title=title) print(p) } @@ -140,6 +142,8 @@ print(paste("Project :", inputFileName)) convertUnits <- function(gatkReportData) { convertGroup <- function(g) { g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS + g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS + g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS g } lapply(gatkReportData, convertGroup) @@ -155,8 +159,8 @@ if ( ! is.na(outputPDF) ) { pdf(outputPDF, height=8.5, width=11) } -plotJobsGantt(gatkReportData, T) -plotJobsGantt(gatkReportData, F) +plotJobsGantt(gatkReportData, T, F) +plotJobsGantt(gatkReportData, F, F) plotProgressByTime(gatkReportData) for ( group in gatkReportData ) { plotGroup(group) diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 16358d05f..5fff8f609 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { } if ( tribbleType == null ) - if ( ! file.canRead() | !! file.isFile() ) { + if ( ! file.canRead() | ! file.isFile() ) { throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file); } else { throw new UserException.CommandLineException( diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 5b9ebd99b..972943e26 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -929,6 +929,14 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(reader); } + /** + * Gets the master sequence dictionary for this GATK engine instance + * @return a never-null dictionary listing all of the contigs known to this engine instance + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return getReferenceDataSource().getReference().getSequenceDictionary(); + } + /** * Returns data source object encapsulating all essential info and handlers used to traverse * reads; header merger, individual file readers etc can be accessed through the returned data source object. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 467aebac5..47eb55b28 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator { if(currentLocus == GenomeLoc.UNMAPPED) { nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); for(SAMReaderID id: dataSource.getReaderIDs()) - nextFilePointer.addFileSpans(id,new GATKBAMFileSpan()); + nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE))); currentLocus = null; continue; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index 5d0c38b78..dc703ff23 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -215,6 +215,45 @@ public class GATKBAMIndex { return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); } + /** + * Use to get close to the unmapped reads at the end of a BAM file. + * @return The file offset of the first record in the last linear bin, or -1 + * if there are no elements in linear bins (i.e. no mapped reads). + */ + public long getStartOfLastLinearBin() { + openIndexFile(); + + seek(4); + + final int sequenceCount = readInteger(); + // Because no reads may align to the last sequence in the sequence dictionary, + // grab the last element of the linear index for each sequence, and return + // the last one from the last sequence that has one. + long lastLinearIndexPointer = -1; + for (int i = 0; i < sequenceCount; i++) { + // System.out.println("# Sequence TID: " + i); + final int nBins = readInteger(); + // System.out.println("# nBins: " + nBins); + for (int j1 = 0; j1 < nBins; j1++) { + // Skip bin # + skipBytes(4); + final int nChunks = readInteger(); + // Skip chunks + skipBytes(16 * nChunks); + } + final int nLinearBins = readInteger(); + if (nLinearBins > 0) { + // Skip to last element of list of linear bins + skipBytes(8 * (nLinearBins - 1)); + lastLinearIndexPointer = readLongs(1)[0]; + } + } + + closeIndexFile(); + + return lastLinearIndexPointer; + } + /** * Gets the possible number of bins for a given reference sequence. * @return How many bins could possibly be used according to this indexing scheme to index a single contig. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java index c2235ec73..5ea75dbb0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java @@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy { Map selectedReaders = new HashMap(); while(selectedReaders.size() == 0 && currentFilePointer != null) { shardPosition = currentFilePointer.fileSpans; + for(SAMReaderID id: shardPosition.keySet()) { - // If the region contains location information (in other words, it is not at - // the start of the unmapped region), add the region. - if(currentFilePointer.isRegionUnmapped) { - // If the region is unmapped and no location data exists, add a null as an indicator to - // start at the next unmapped region. - if(!isIntoUnmappedRegion) { - selectedReaders.put(id,null); - isIntoUnmappedRegion = true; - } - else - selectedReaders.put(id,position.get(id)); - } - else { - SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); - if(!fileSpan.isEmpty()) - selectedReaders.put(id,fileSpan); - } + SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); + if(!fileSpan.isEmpty()) + selectedReaders.put(id,fileSpan); } if(selectedReaders.size() > 0) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 59fb4aa9e..3b9e35311 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -97,7 +97,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); - traversalEngine.startTimers(); ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 65ff27497..09ab4bd44 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -44,7 +44,6 @@ public class LinearMicroScheduler extends MicroScheduler { * @param shardStrategy A strategy for sharding the data. */ public Object execute(Walker walker, ShardStrategy shardStrategy) { - traversalEngine.startTimers(); walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); @@ -54,6 +53,7 @@ public class LinearMicroScheduler extends MicroScheduler { if ( done || shard == null ) // we ran out of shards that aren't owned break; + traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 6136bd68d..2b6488ada 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -57,6 +57,7 @@ public class ShardTraverser implements Callable { public Object call() { try { + traversalEngine.startTimersIfNecessary(); long startTime = System.currentTimeMillis(); Object accumulator = walker.reduceInit(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java index 30b2f828d..8e241bb2c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; * @version 0.1 */ public class PlatformFilter extends ReadFilter { - @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false) + @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false) protected String[] PLFilterNames; public boolean filterOut(SAMRecord rec) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index ebb4cbe66..4ca7b935f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { else if ( stub.getOutputStream() != null ) { this.file = null; this.stream = stub.getOutputStream(); - writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes()); + writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes()); } else throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); @@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { } // The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it. - return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes()); + return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index 936243f9d..82cb43634 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.io.stubs; +import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub, VCFWriter { return isCompressed; } + /** + * Gets the master sequence dictionary from the engine associated with this stub + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return engine.getMasterSequenceDictionary(); + } + /** * Should we tell the VCF writer not to write genotypes? * @return true if the writer should not write genotypes. diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 4d94130a8..5a7658031 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -293,15 +293,16 @@ public class GATKRunReport { * That is, postReport() is guarenteed not to fail for any reason. */ private File postReportToLocalDisk(File rootDir) { + String filename = getID() + ".report.xml.gz"; + File file = new File(rootDir, filename); try { - String filename = getID() + ".report.xml.gz"; - File file = new File(rootDir, filename); postReportToFile(file); logger.debug("Wrote report to " + file); return file; } catch ( Exception e ) { // we catch everything, and no matter what eat the error exceptDuringRunReport("Couldn't read report file", e); + file.delete(); return null; } } @@ -312,6 +313,7 @@ public class GATKRunReport { File localFile = postReportToLocalDisk(new File("./")); logger.debug("Generating GATK report to AWS S3 based on local file " + localFile); if ( localFile != null ) { // we succeeded in creating the local file + localFile.deleteOnExit(); try { // stop us from printing the annoying, and meaningless, mime types warning Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); @@ -336,14 +338,13 @@ public class GATKRunReport { //logger.info("Uploading " + localFile + " to AWS bucket"); S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject); logger.debug("Uploaded to AWS: " + s3Object); + logger.info("Uploaded run statistics report to AWS S3"); } catch ( S3ServiceException e ) { exceptDuringRunReport("S3 exception occurred", e); } catch ( NoSuchAlgorithmException e ) { exceptDuringRunReport("Couldn't calculate MD5", e); } catch ( IOException e ) { exceptDuringRunReport("Couldn't read report file", e); - } finally { - localFile.delete(); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java index 029800aea..9e5a95d10 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java @@ -101,7 +101,7 @@ public class RMDIndexer extends CommandLineProgram { Index index = IndexFactory.createIndex(inputFileSource, codec, approach); // add writing of the sequence dictionary, if supplied - builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false); + builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary()); // create the output stream, and write the index LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java new file mode 100644 index 000000000..d133439dc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata.tracks; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import org.broad.tribble.index.Index; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.utils.SequenceDictionaryUtils; + +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +/** + * Utilities for working with Sequence Dictionaries embedded in tribble indices + * + * @author Your Name + * @since Date created + */ +public class IndexDictionaryUtils { + private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); + + // a constant we use for marking sequence dictionary entries in the Tribble index property list + public static final String SequenceDictionaryPropertyPredicate = "DICT:"; + + /** + * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index + * @param index the index file to use + * @return a SAMSequenceDictionary if available, null if unavailable + */ + public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + for (Map.Entry entry : index.getProperties().entrySet()) { + if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) + dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), + Integer.valueOf(entry.getValue()))); + } + return dict; + } + + /** + * create the sequence dictionary with the contig list; a backup approach + * @param index the index file to use + * @param dict the sequence dictionary to add contigs to + * @return the filled-in sequence dictionary + */ + static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { + LinkedHashSet seqNames = index.getSequenceNames(); + if (seqNames == null) { + return dict; + } + for (String name : seqNames) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { + for ( SAMSequenceRecord seq : dict.getSequences() ) { + final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); + final String length = String.valueOf(seq.getSequenceLength()); + index.addProperty(contig,length); + } + } + + public static void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict, + final ValidationExclusion.TYPE validationExclusionType ) { + // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation + if (trackDict == null || trackDict.size() == 0) + logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); + else { + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 06d05912a..3b4558579 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureSource; @@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import java.util.LinkedHashSet; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - /** - * - * @author aaron + * + * @author aaron * ` * Class RMDTrackBuilder * @@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager { private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false; - // a constant we use for marking sequence dictionary entries in the Tribble index property list - public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - // private sequence dictionary we use to set our tracks with private SAMSequenceDictionary dict = null; @@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager { try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } catch (ReviewedStingException e) { } - sequenceDictionary = getSequenceDictionaryFromProperties(index); + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match if (sequenceDictionary.size() == 0 && dict != null) { File indexFile = Tribble.indexFile(inputFile); - setIndexSequenceDictionary(inputFile,index,dict,indexFile,true); - sequenceDictionary = getSequenceDictionaryFromProperties(index); + validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); + try { // re-write the index + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); + } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); } if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE ) @@ -363,88 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager { // this can take a while, let them know what we're doing logger.info("Creating Tribble index in memory for file " + inputFile); Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - setIndexSequenceDictionary(inputFile, idx, dict, null, false); + validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); return idx; } - - // --------------------------------------------------------------------------------------------------------- - // static functions to work with the sequence dictionaries of indexes - // --------------------------------------------------------------------------------------------------------- - - /** - * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index - * @param index the index file to use - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - for (Map.Entry entry : index.getProperties().entrySet()) { - if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) - dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), - Integer.valueOf(entry.getValue()))); - } - return dict; - } - - /** - * create the sequence dictionary with the contig list; a backup approach - * @param index the index file to use - * @param dict the sequence dictionary to add contigs to - * @return the filled-in sequence dictionary - */ - private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { - LinkedHashSet seqNames = index.getSequenceNames(); - if (seqNames == null) { - return dict; - } - for (String name : seqNames) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - /** * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. * (that each contig in the index is in the sequence dictionary). * @param inputFile for proper error message formatting. * @param dict the sequence dictionary * @param index the index file - * @param indexFile the index file - * @param rewriteIndex should we rewrite the index when we're done? - * */ - public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) { - if (dict == null) return; - - SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); - validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict); + public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { + if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null"); // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set - for (SAMSequenceRecord seq : currentDict.getSequences()) { - if (dict.getSequence(seq.getSequenceName()) == null) - continue; - index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength())); - } - // re-write the index - if (rewriteIndex) try { - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); - } + final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); + + // actually update the dictionary in the index + IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); } - - public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) { - // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) - logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); - else { - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict); - } + public void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict ) { + IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 89a179d0e..c6321e2ad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -115,12 +115,13 @@ public abstract class TraversalEngine,Provide LinkedList history = new LinkedList(); /** We use the SimpleTimer to time our run */ - private SimpleTimer timer = new SimpleTimer("Traversal"); + private SimpleTimer timer = null; // How long can we go without printing some progress info? private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? + private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; @@ -209,11 +210,16 @@ public abstract class TraversalEngine,Provide } } /** - * Should be called to indicate that we're going to process records and the timer should start ticking + * Should be called to indicate that we're going to process records and the timer should start ticking. This + * function should be called right before any traversal work is done, to avoid counting setup costs in the + * processing costs and inflating the estimated runtime. */ - public void startTimers() { - timer.start(); - lastProgressPrintTime = timer.currentTime(); + public void startTimersIfNecessary() { + if ( timer == null ) { + timer = new SimpleTimer("Traversal"); + timer.start(); + lastProgressPrintTime = timer.currentTime(); + } } /** @@ -224,7 +230,8 @@ public abstract class TraversalEngine,Provide * @return true if the maximum interval (in millisecs) has passed since the last printing */ private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) { - return (curTime - lastPrintTime) > printFreq; + long elapsed = curTime - lastPrintTime; + return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; } /** @@ -351,7 +358,7 @@ public abstract class TraversalEngine,Provide public void printOnTraversalDone() { printProgress(null, null, true); - final double elapsed = timer.getElapsedTime(); + final double elapsed = timer == null ? 0 : timer.getElapsedTime(); ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java index 84549b13a..7960f5c35 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java @@ -26,21 +26,23 @@ package org.broadinstitute.sting.gatk.walkers; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import java.io.PrintStream; -import java.util.Iterator; /** * Prints out all of the RODs in the input data set. Data is rendered using the toString() method * of the given ROD. */ public class PrintRODsWalker extends RodWalker { + @Input(fullName="input", shortName = "input", doc="The input ROD which should be printed out.", required=true) + public RodBinding input; + @Output PrintStream out; @@ -62,7 +64,7 @@ public class PrintRODsWalker extends RodWalker { if ( tracker == null ) return 0; - for ( Feature feature : tracker.getValues(Feature.class) ) { + for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) { out.println(feature.toString()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index fdfac6bf7..4f072e88c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; * -I input1.bam \ * -I input2.bam \ * --read_filter MappingQualityZero + * + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T PrintReads \ + * -o output.bam \ + * -I input.bam \ + * -n 2000 * * */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index c88c7c3c4..10261112c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers; +import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -77,6 +78,15 @@ public abstract class Walker { return toolkit; } + /** + * Gets the master sequence dictionary for this walker + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + protected SAMSequenceDictionary getMasterSequenceDictionary() { + return getToolkit().getMasterSequenceDictionary(); + } + /** * (conceptual static) method that states whether you want to see reads piling up at a locus * that contain a deletion at the locus. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java index 180bed24d..d2c4d24ab 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java @@ -26,7 +26,7 @@ public class SBByDepth extends AnnotationByDepth { if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY)) return null; - double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY)); + double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1); final Map genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 350c683c2..4ead77506 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -24,7 +24,9 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -32,10 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -46,134 +45,429 @@ import java.util.*; * (http://snpeff.sourceforge.net/). * * For each variant, chooses one of the effects of highest biological impact from the SnpEff - * output file (which must be provided on the command line via --snpEffFile:SnpEff ), + * output file (which must be provided on the command line via --snpEffFile filename.vcf), * and adds annotations on that effect. * - * The possible biological effects and their associated impacts are defined in the class: - * org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants - * * @author David Roazen */ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation { - // SnpEff annotation key names: - public static final String GENE_ID_KEY = "GENE_ID"; - public static final String GENE_NAME_KEY = "GENE_NAME"; - public static final String TRANSCRIPT_ID_KEY = "TRANSCRIPT_ID"; - public static final String EXON_ID_KEY = "EXON_ID"; - public static final String EXON_RANK_KEY = "EXON_RANK"; - public static final String WITHIN_NON_CODING_GENE_KEY = "WITHIN_NON_CODING_GENE"; - public static final String EFFECT_KEY = "EFFECT"; - public static final String EFFECT_IMPACT_KEY = "EFFECT_IMPACT"; - public static final String EFFECT_EXTRA_INFORMATION_KEY = "EFFECT_EXTRA_INFORMATION"; - public static final String OLD_NEW_AA_KEY = "OLD_NEW_AA"; - public static final String OLD_NEW_CODON_KEY = "OLD_NEW_CODON"; - public static final String CODON_NUM_KEY = "CODON_NUM"; - public static final String CDS_SIZE_KEY = "CDS_SIZE"; + private static Logger logger = Logger.getLogger(SnpEff.class); + + // We refuse to parse SnpEff output files generated by unsupported versions, or + // lacking a SnpEff version number in the VCF header: + public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" }; + public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; + + // SnpEff aggregates all effects (and effect metadata) together into a single INFO + // field annotation with the key EFF: + public static final String SNPEFF_INFO_FIELD_KEY = "EFF"; + public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]"; + public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|"; + + // Key names for the INFO field annotations we will add to each record, along + // with parsing-related information: + public enum InfoFieldKey { + EFFECT_KEY ("SNPEFF_EFFECT", -1), + IMPACT_KEY ("SNPEFF_IMPACT", 0), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), + EXON_ID_KEY ("SNPEFF_EXON_ID", 7); + + // Actual text of the key + private final String keyName; + + // Index within the effect metadata subfields from the SnpEff EFF annotation + // where each key's associated value can be found during parsing. + private final int fieldIndex; + + InfoFieldKey ( String keyName, int fieldIndex ) { + this.keyName = keyName; + this.fieldIndex = fieldIndex; + } + + public String getKeyName() { + return keyName; + } + + public int getFieldIndex() { + return fieldIndex; + } + } + + // Possible SnpEff biological effects. All effect names found in the SnpEff input file + // are validated against this list. + public enum EffectType { + NONE, + CHROMOSOME, + INTERGENIC, + UPSTREAM, + UTR_5_PRIME, + UTR_5_DELETED, + START_GAINED, + SPLICE_SITE_ACCEPTOR, + SPLICE_SITE_DONOR, + START_LOST, + SYNONYMOUS_START, + NON_SYNONYMOUS_START, + CDS, + GENE, + TRANSCRIPT, + EXON, + EXON_DELETED, + NON_SYNONYMOUS_CODING, + SYNONYMOUS_CODING, + FRAME_SHIFT, + CODON_CHANGE, + CODON_INSERTION, + CODON_CHANGE_PLUS_CODON_INSERTION, + CODON_DELETION, + CODON_CHANGE_PLUS_CODON_DELETION, + STOP_GAINED, + SYNONYMOUS_STOP, + NON_SYNONYMOUS_STOP, + STOP_LOST, + INTRON, + UTR_3_PRIME, + UTR_3_DELETED, + DOWNSTREAM, + INTRON_CONSERVED, + INTERGENIC_CONSERVED, + REGULATION, + CUSTOM, + WITHIN_NON_CODING_GENE + } + + // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. + public enum EffectImpact { + LOW (1), + MODERATE (2), + HIGH (3); + + private final int severityRating; + + EffectImpact ( int severityRating ) { + this.severityRating = severityRating; + } + + public boolean isHigherImpactThan ( EffectImpact other ) { + return this.severityRating > other.severityRating; + } + } + + // SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information. + public enum EffectCoding { + CODING, + NON_CODING, + UNKNOWN + } + + + public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { + validateRodBinding(walker.getSnpEffRodBinding()); + checkSnpEffVersion(walker, toolkit); + } public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { - RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); - validateRodBinding(snpEffRodBinding); + RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); - List features = tracker.getValues(snpEffRodBinding, ref.getLocus()); + // Get only SnpEff records that start at this locus, not merely span it: + List snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus()); - // Add only annotations for one of the most biologically-significant effects as defined in - // the SnpEffConstants class: - SnpEffFeature mostSignificantEffect = getMostSignificantEffect(features); - - if ( mostSignificantEffect == null ) { + // Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate. + // If there is more than one such record, we only need to pick the first one, since the biological + // effects will be the same across all such records: + VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc); + if ( matchingRecord == null ) { return null; } - return generateAnnotations(mostSignificantEffect); + // Parse the SnpEff INFO field annotation from the matching record into individual effect objects: + List effects = parseSnpEffRecord(matchingRecord); + if ( effects.size() == 0 ) { + return null; + } + + // Add only annotations for one of the most biologically-significant effects from this set: + SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects); + return mostSignificantEffect.getAnnotations(); } - private void validateRodBinding ( RodBinding snpEffRodBinding ) { + private void validateRodBinding ( RodBinding snpEffRodBinding ) { if ( snpEffRodBinding == null || ! snpEffRodBinding.isBound() ) { - throw new UserException("The SnpEff annotator requires that a SnpEff output file be provided " + - "as a rodbinding on the command line, but no SnpEff rodbinding was found."); + throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " + + "as a rodbinding on the command line via the --snpEffFile option, but " + + "no SnpEff rodbinding was found."); } } - private SnpEffFeature getMostSignificantEffect ( List snpEffFeatures ) { - SnpEffFeature mostSignificantEffect = null; + private void checkSnpEffVersion ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { + RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); - for ( SnpEffFeature snpEffFeature : snpEffFeatures ) { + VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName()); + VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY); + + if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) { + throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " + + "input file, and so could not verify that the file was generated by a supported version of SnpEff (" + + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")"); + } + + String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0]; + + if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) { + throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " + + "is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS)); + } + } + + private boolean isSupportedSnpEffVersion ( String versionString ) { + for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) { + if ( supportedVersion.equals(versionString) ) { + return true; + } + } + + return false; + } + + private VariantContext getMatchingSnpEffRecord ( List snpEffRecords, VariantContext vc ) { + for ( VariantContext snpEffRecord : snpEffRecords ) { + if ( snpEffRecord.hasSameAlternateAllelesAs(vc) && snpEffRecord.getReference().equals(vc.getReference()) ) { + return snpEffRecord; + } + } + + return null; + } + + private List parseSnpEffRecord ( VariantContext snpEffRecord ) { + List parsedEffects = new ArrayList(); + + Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY); + List individualEffects; + + // The VCF codec stores multi-valued fields as a List, and single-valued fields as a String. + // We can have either in the case of SnpEff, since there may be one or more than one effect in this record. + if ( effectFieldValue instanceof List ) { + individualEffects = (List)effectFieldValue; + } + else { + individualEffects = Arrays.asList((String)effectFieldValue); + } + + for ( String effectString : individualEffects ) { + String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER); + + if ( effectNameAndMetadata.length != 2 ) { + logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s", + snpEffRecord.getChr(), snpEffRecord.getStart(), effectString)); + continue; + } + + String effectName = effectNameAndMetadata[0]; + String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1); + + SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata); + + if ( parsedEffect.isWellFormed() ) { + parsedEffects.add(parsedEffect); + } + else { + logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"", + snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString)); + } + } + + return parsedEffects; + } + + private SnpEffEffect getMostSignificantEffect ( List effects ) { + SnpEffEffect mostSignificantEffect = null; + + for ( SnpEffEffect effect : effects ) { if ( mostSignificantEffect == null || - snpEffFeature.isHigherImpactThan(mostSignificantEffect) ) { + effect.isHigherImpactThan(mostSignificantEffect) ) { - mostSignificantEffect = snpEffFeature; + mostSignificantEffect = effect; } } return mostSignificantEffect; } - private Map generateAnnotations ( SnpEffFeature mostSignificantEffect ) { - Map annotations = new LinkedHashMap(Utils.optimumHashSize(getKeyNames().size())); - - if ( mostSignificantEffect.hasGeneID() ) - annotations.put(GENE_ID_KEY, mostSignificantEffect.getGeneID()); - if ( mostSignificantEffect.hasGeneName() ) - annotations.put(GENE_NAME_KEY, mostSignificantEffect.getGeneName()); - if ( mostSignificantEffect.hasTranscriptID() ) - annotations.put(TRANSCRIPT_ID_KEY, mostSignificantEffect.getTranscriptID()); - if ( mostSignificantEffect.hasExonID() ) - annotations.put(EXON_ID_KEY, mostSignificantEffect.getExonID()); - if ( mostSignificantEffect.hasExonRank() ) - annotations.put(EXON_RANK_KEY, Integer.toString(mostSignificantEffect.getExonRank())); - if ( mostSignificantEffect.isNonCodingGene() ) - annotations.put(WITHIN_NON_CODING_GENE_KEY, null); - - annotations.put(EFFECT_KEY, mostSignificantEffect.getEffect().toString()); - annotations.put(EFFECT_IMPACT_KEY, mostSignificantEffect.getEffectImpact().toString()); - if ( mostSignificantEffect.hasEffectExtraInformation() ) - annotations.put(EFFECT_EXTRA_INFORMATION_KEY, mostSignificantEffect.getEffectExtraInformation()); - - if ( mostSignificantEffect.hasOldAndNewAA() ) - annotations.put(OLD_NEW_AA_KEY, mostSignificantEffect.getOldAndNewAA()); - if ( mostSignificantEffect.hasOldAndNewCodon() ) - annotations.put(OLD_NEW_CODON_KEY, mostSignificantEffect.getOldAndNewCodon()); - if ( mostSignificantEffect.hasCodonNum() ) - annotations.put(CODON_NUM_KEY, Integer.toString(mostSignificantEffect.getCodonNum())); - if ( mostSignificantEffect.hasCdsSize() ) - annotations.put(CDS_SIZE_KEY, Integer.toString(mostSignificantEffect.getCdsSize())); - - return annotations; - } - public List getKeyNames() { - return Arrays.asList( GENE_ID_KEY, - GENE_NAME_KEY, - TRANSCRIPT_ID_KEY, - EXON_ID_KEY, - EXON_RANK_KEY, - WITHIN_NON_CODING_GENE_KEY, - EFFECT_KEY, - EFFECT_IMPACT_KEY, - EFFECT_EXTRA_INFORMATION_KEY, - OLD_NEW_AA_KEY, - OLD_NEW_CODON_KEY, - CODON_NUM_KEY, - CDS_SIZE_KEY + return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), + InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), + InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), + InfoFieldKey.GENE_NAME_KEY.getKeyName(), + InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), + InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), + InfoFieldKey.EXON_ID_KEY.getKeyName() ); } public List getDescriptions() { return Arrays.asList( - new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"), - new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), - new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())), - new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant") + new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), + new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } + + /** + * Helper class to parse, validate, and store a single SnpEff effect and its metadata. + */ + protected static class SnpEffEffect { + private EffectType effect; + private EffectImpact impact; + private String codonChange; + private String aminoAcidChange; + private String geneName; + private String geneBiotype; + private EffectCoding coding; + private String transcriptID; + private String exonID; + + private String parseError = null; + private boolean isWellFormed = true; + + private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10; + + // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header, + // errors come after warnings, not vice versa: + private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1; + private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1; + + private static final int SNPEFF_CODING_FIELD_INDEX = 5; + + public SnpEffEffect ( String effectName, String[] effectMetadata ) { + parseEffectName(effectName); + parseEffectMetadata(effectMetadata); + } + + private void parseEffectName ( String effectName ) { + try { + effect = EffectType.valueOf(effectName); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("%s is not a recognized effect type", effectName)); + } + } + + private void parseEffectMetadata ( String[] effectMetadata ) { + if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) { + if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) { + parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX])); + } + else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { + parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX])); + } + else { + parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d", + EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length)); + } + + return; + } + + try { + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + } + + codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; + aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; + geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; + geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; + + if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { + try { + coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX])); + } + } + else { + coding = EffectCoding.UNKNOWN; + } + + transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()]; + exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()]; + } + + private void parseError ( String message ) { + isWellFormed = false; + + // Cache only the first error encountered: + if ( parseError == null ) { + parseError = message; + } + } + + public boolean isWellFormed() { + return isWellFormed; + } + + public String getParseError() { + return parseError == null ? "" : parseError; + } + + public boolean isCoding() { + return coding == EffectCoding.CODING; + } + + public boolean isHigherImpactThan ( SnpEffEffect other ) { + // If one effect is within a coding gene and the other is not, the effect that is + // within the coding gene has higher impact: + + if ( isCoding() && ! other.isCoding() ) { + return true; + } + else if ( ! isCoding() && other.isCoding() ) { + return false; + } + + // Otherwise, both effects are either in or not in a coding gene, so we compare the impacts + // of the effects themselves: + + return impact.isHigherImpactThan(other.impact); + } + + public Map getAnnotations() { + Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length)); + + addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); + addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); + addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); + addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); + addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); + addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); + addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); + + return annotations; + } + + private void addAnnotation ( Map annotations, String keyName, String keyValue ) { + // Only add annotations for keys associated with non-empty values: + if ( keyValue != null && keyValue.trim().length() > 0 ) { + annotations.put(keyName, keyValue); + } + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 96a400c68..fb3dbc3cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -40,7 +40,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -86,14 +85,15 @@ public class VariantAnnotator extends RodWalker implements Ann @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + public RodBinding getVariantRodBinding() { return variantCollection.variants; } /** * The INFO field will be annotated with information on the most biologically-significant effect * listed in the SnpEff output file for each variant. */ @Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false) - public RodBinding snpEffFile; - public RodBinding getSnpEffRodBinding() { return snpEffFile; } + public RodBinding snpEffFile; + public RodBinding getSnpEffRodBinding() { return snpEffFile; } /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. @@ -203,11 +203,13 @@ public class VariantAnnotator extends RodWalker implements Ann } if ( USE_ALL_ANNOTATIONS ) - engine = new VariantAnnotatorEngine(this); + engine = new VariantAnnotatorEngine(this, getToolkit()); else - engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this); + engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit()); engine.initializeExpressions(expressionsToUse); + engine.invokeAnnotationInitializationMethods(); + // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones Set hInfo = new HashSet(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 01926a7f3..68cd07803 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -26,13 +26,11 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -49,6 +47,7 @@ public class VariantAnnotatorEngine { private HashMap, String> dbAnnotations = new HashMap, String>(); private AnnotatorCompatibleWalker walker; + private GenomeAnalysisEngine toolkit; private static class VAExpression { @@ -74,16 +73,18 @@ public class VariantAnnotatorEngine { } // use this constructor if you want all possible annotations - public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker) { + public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { this.walker = walker; + this.toolkit = toolkit; requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations(); requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations(); initializeDBs(); } // use this constructor if you want to select specific annotations (and/or interfaces) - public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker) { + public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { this.walker = walker; + this.toolkit = toolkit; initializeAnnotations(annotationGroupsToUse, annotationsToUse); initializeDBs(); } @@ -113,6 +114,16 @@ public class VariantAnnotatorEngine { dbAnnotations.put(rod, rod.getName()); } + public void invokeAnnotationInitializationMethods() { + for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { + annotation.initialize(walker, toolkit); + } + + for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) { + annotation.initialize(walker, toolkit); + } + } + public Set getVCFAnnotationDescriptions() { Set descriptions = new HashSet(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java index 20a2aea0e..7200f841b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.List; @@ -9,8 +8,9 @@ import java.util.List; public interface AnnotatorCompatibleWalker { // getter methods for various used bindings - public abstract RodBinding getSnpEffRodBinding(); + public abstract RodBinding getVariantRodBinding(); + public abstract RodBinding getSnpEffRodBinding(); public abstract RodBinding getDbsnpRodBinding(); public abstract List> getCompRodBindings(); public abstract List> getResourceRodBindings(); -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java index f33d61df9..160a3d258 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java @@ -24,18 +24,16 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.List; -import java.util.Map; @DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations") public abstract class VariantAnnotatorAnnotation { // return the INFO keys public abstract List getKeyNames(); + + // initialization method (optional for subclasses, and therefore non-abstract) + public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 60f0fcb0a..7f6dabeec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -79,24 +79,45 @@ public class BeagleOutputToVCFWalker extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + /** + * If this argument is present, the original allele frequencies and counts from this vcf are added as annotations ACH,AFH and ANH. at each record present in this vcf + */ @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) public RodBinding comp; + + /** + * This required argument is used to annotate each site in the vcf INFO field with R2 annotation. Will be NaN if Beagle determined there are no variant samples. + */ @Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true) public RodBinding beagleR2; + /** + * These values will populate the GL field for each sample and contain the posterior probability of each genotype given the data after phasing and imputation. + */ @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true) public RodBinding beagleProbs; + /** + * By default, all genotypes will be marked in the VCF as "phased", using the "|" separator after Beagle. + */ @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true) public RodBinding beaglePhased; @Output(doc="VCF File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; + /** + * If this argument is absent, and if Beagle determines that there is no sample in a site that has a variant genotype, the site will be marked as filtered (Default behavior). + * If the argument is present, the site won't be marked as filtered under this condition even if there are no variant genotypes. + */ @Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false) public boolean DONT_FILTER_MONOMORPHIC_SITES = false; + /** + * Value between 0 and 1. If the probability of getting a genotype correctly (based on the posterior genotype probabilities and the actual genotype) is below this threshold, + * a genotype will be substitute by a no-call. + */ @Argument(fullName="no" + "call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false) private double noCallThreshold = 0.0; @@ -154,21 +175,16 @@ public class BeagleOutputToVCFWalker extends RodWalker { } BeagleFeature beagleR2Feature = tracker.getFirstValue(beagleR2); - // ignore places where we don't have a variant - if ( beagleR2Feature == null ) - return 0; - - BeagleFeature beagleProbsFeature = tracker.getFirstValue(beagleProbs); - - // ignore places where we don't have a variant - if ( beagleProbsFeature == null ) - return 0; - BeagleFeature beaglePhasedFeature = tracker.getFirstValue(beaglePhased); + // ignore places where we don't have a variant - if ( beaglePhasedFeature == null ) - return 0; + if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null) + { + vcfWriter.add(vc_input); + return 1; + } + // get reference base for current position byte refByte = ref.getBase(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 07793fd7b..87695077d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -112,6 +112,9 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) VCFWriter bootstrapVCFOutput = null; + /** + * If sample gender is known, this flag should be set to true to ensure that Beagle treats male Chr X properly. + */ @Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false) public boolean CHECK_IS_MALE_ON_CHR_X = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 3a18fe610..664c319ab 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -63,20 +63,32 @@ import java.util.*; *

Input

*

* One or more bam files (with proper headers) to be analyzed for coverage statistics - * (Optional) A REFSEQ Rod to aggregate coverage to the gene level *

- * + *

+ *(Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) + *

*

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: + *

* - no suffix: per locus coverage + *

* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases + *

* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases + *

* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval + *

* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples + *

* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene + *

* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples + *

* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases + *

* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases *

* @@ -84,7 +96,7 @@ import java.util.*; *
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T VariantEval \
+ *   -T DepthOfCoverage \
  *   -o file_name_base \
  *   -I input_bams.list
  *   [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
index fd912334f..4e2c17bf6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
@@ -43,8 +43,10 @@ import java.util.List;
  * Generates an alternative reference sequence over the specified interval.
  *
  * 

- * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). - * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'. + * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). + * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * Note that if there are multiple variants at a site, it takes the first one seen. + * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). * *

Input

*

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java index 5f3b37cc8..7ae5c5c75 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java @@ -42,6 +42,9 @@ import java.io.PrintStream; * *

* The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). * *

Input

*

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 5f6865d04..ec180f0cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -276,8 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( elt.isReducedRead() ) { // reduced read representation byte qual = elt.getReducedQual(); - add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods - return elt.getReducedCount(); // we added nObs bases here + if ( BaseUtils.isRegularBase( elt.getBase() )) { + add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods + return elt.getReducedCount(); // we added nObs bases here + } else // odd bases or deletions => don't use them + return 0; } else { byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index cd006a3cf..6ae437b27 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private boolean SIMPLE_GREEDY_GENOTYPER = false; - + private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. final private ExactCalculation calcToUse; protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { @@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } - private static final double[][] getGLs(Map GLs) { - double[][] genotypeLikelihoods = new double[GLs.size()+1][]; + private static final ArrayList getGLs(Map GLs) { + ArrayList genotypeLikelihoods = new ArrayList(); - int j = 0; + //int j = 0; + genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.values() ) { - j++; - if ( sample.hasLikelihoods() ) { //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); - genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector(); + double[] gls = sample.getLikelihoods().getAsVector(); + + if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL) + genotypeLikelihoods.add(gls); } } return genotypeLikelihoods; } + // ------------------------------------------------------------------------------------- // // Linearized, ~O(N), implementation. @@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public int linearExact(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { - final int numSamples = GLs.size(); + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; - final double[][] genotypeLikelihoods = getGLs(GLs); final ExactACCache logY = new ExactACCache(numSamples+1); logY.getkMinus0()[0] = 0.0; // the zero case @@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( k == 0 ) { // special case for k = 0 for ( int j=1; j <= numSamples; j++ ) { - kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA]; + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA]; } } else { // k > 0 final double[] kMinus1 = logY.getkMinus1(); final double[] kMinus2 = logY.getkMinus2(); for ( int j=1; j <= numSamples; j++ ) { - final double[] gl = genotypeLikelihoods[j]; + final double[] gl = genotypeLikelihoods.get(j); final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; double aa = Double.NEGATIVE_INFINITY; @@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( !vc.isVariant() ) throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - boolean multiAllelicRecord = false; - - if (vc.getAlternateAlleles().size() > 1) - multiAllelicRecord = true; Map GLs = vc.getGenotypes(); double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; @@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { pathMetricArray[0][0] = 0.0; // todo = can't deal with optimal dynamic programming solution with multiallelic records - if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) { + if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { sampleIndices.addAll(GLs.keySet()); sampleIdx = GLs.size(); } @@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { continue; double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); + + if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) { + //System.out.print(sample.getKey()+":"); + //for (int k=0; k < likelihoods.length; k++) + // System.out.format("%4.2f ",likelihoods[k]); + //System.out.println(); + // all likelihoods are essentially the same: skip this sample and will later on force no call. + //sampleIdx++; + continue; + } + sampleIndices.add(sample.getKey()); for (int k=0; k <= AFofMaxLikelihood; k++) { @@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { Genotype g = GLs.get(sample); if ( !g.hasLikelihoods() ) continue; - - if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) - bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); - else { - int newIdx = tracebackArray[k][startIdx]; - bestGTguess = startIdx - newIdx; - startIdx = newIdx; - } - + // if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now, + // and will add no-call genotype to GL's in a second pass ArrayList myAlleles = new ArrayList(); double qual = Double.NEGATIVE_INFINITY; double[] likelihoods = g.getLikelihoods().getAsVector(); + + if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { + bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); + } + else { + int newIdx = tracebackArray[k][startIdx];; + bestGTguess = startIdx - newIdx; + startIdx = newIdx; + } + /* System.out.format("Sample: %s GL:",sample); for (int i=0; i < likelihoods.length; i++) - System.out.format("%1.4f ",likelihoods[i]); + System.out.format("%1.4f, ",likelihoods[i]); */ for (int i=0; i < likelihoods.length; i++) { @@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } + for ( Map.Entry sample : GLs.entrySet() ) { + + if ( !sample.getValue().hasLikelihoods() ) + continue; + Genotype g = GLs.get(sample.getKey()); + + double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); + + if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL) + continue; // regular likelihoods + + ArrayList myAlleles = new ArrayList(); + + double qual = Genotype.NO_NEG_LOG_10PERROR; + myAlleles.add(Allele.NO_CALL); + myAlleles.add(Allele.NO_CALL); + //System.out.println(myAlleles.toString()); + calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false)); + } return calls; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 41b340058..ec5eefd60 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.Haplotype; @@ -321,7 +323,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood haplotypeMap.clear(); if (getAlleleListFromVCF) { - for( final VariantContext vc_input : tracker.getValues(UAC.alleles) ) { + for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { if( vc_input != null && allowableTypes.contains(vc_input.getType()) && ref.getLocus().getStart() == vc_input.getStart()) { @@ -413,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (pileup != null ) { double[] genotypeLikelihoods; + if (useOldWrongHorribleHackedUpLikelihoodModel) genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap); else genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - - // which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example - // for 3 alleles it's 00 01 11 02 12 22 - GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), + GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), alleleList, genotypeLikelihoods, getFilteredDepth(pileup))); @@ -444,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return indelLikelihoodMap.get(); } + // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup, + // so that per-sample DP will include deletions covering the event. + protected int getFilteredDepth(ReadBackedPileup pileup) { + int count = 0; + for ( PileupElement p : pileup ) { + if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) ) + count++; + } + + return count; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 477155241..6905ce4a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -58,25 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; } - public static VariantContext getSNPVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { - if ( tracker == null || ref == null || logger == null ) - throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger); - VariantContext vc = null; - - // search for usable record - for( final VariantContext vc_input : tracker.getValues(allelesBinding) ) { - if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { - if ( vc == null ) { - vc = vc_input; - } else { - logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record"); - } - } - } - - return vc; - } - public Allele getLikelihoods(RefMetaDataTracker tracker, ReferenceContext ref, Map contexts, @@ -96,7 +75,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC if ( alternateAlleleToUse != null ) { bestAlternateAllele = alternateAlleleToUse.getBases()[0]; } else if ( useAlleleFromVCF ) { - VariantContext vc = getSNPVCFromAllelesRod(tracker, ref, true, logger, UAC.alleles); + VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); // ignore places where we don't have a variant if ( vc == null ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index 500b11360..d88e55687 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index d5dbdedd6..428f97e2a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.codecs.snpEff.SnpEffFeature; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -127,7 +126,8 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; } - public RodBinding getSnpEffRodBinding() { return null; } + public RodBinding getVariantRodBinding() { return null; } + public RodBinding getSnpEffRodBinding() { return null; } public List> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } @@ -210,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker stratifiedContexts, AlignmentContext rawContext) { VariantContext vc; if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger, UAC.alleles); + VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); if ( vcInput == null ) return null; - vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()); + vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase()); + } else { // deal with bad/non-standard reference bases if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) @@ -635,7 +635,7 @@ public class UnifiedGenotyperEngine { // no extended event pileup // if we're genotyping given alleles and we have a requested SNP at this position, do SNP if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { - VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger, UAC.alleles); + VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); if (vcInput == null) return null; @@ -741,4 +741,23 @@ public class UnifiedGenotyperEngine { return afcm; } + + public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { + if ( tracker == null || ref == null || logger == null ) + throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger); + VariantContext vc = null; + + // search for usable record + for( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { + if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { + if ( vc == null ) { + vc = vc_input; + } else { + logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record"); + } + } + } + + return vc; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index e68aa31e0..232e468f9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel { baseMatchArray = new double[MAX_CACHED_QUAL+1]; baseMismatchArray = new double[MAX_CACHED_QUAL+1]; for (int k=1; k <= MAX_CACHED_QUAL; k++) { - double baseProb = QualityUtils.qualToProb(k); + double baseProb = QualityUtils.qualToProb((byte)k); baseMatchArray[k] = probToQual(baseProb); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index e5ad3106d..8bba8eac2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.*; import java.util.*; + /** + * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well, + * but this latter functionality is now superceded by UnifiedGenotyper. + * + *

* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing - * data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read - * and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample, - * or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified, - * the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline - * if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic - * if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains - * only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords. + * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs + * include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many + * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional + * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will + * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional + * metrics for the post-processing tools to make the final decision). The calls are performed by default + * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I + * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments, + * respectively. Indels are called from the tumor sample and annotated as germline + * if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic + * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling + * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold + * in tumor sample, it will not be reported. * - * If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you, - * please email asivache at broadinstitute dot org and he will gladly explain everything in more detail. + * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input + * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged + * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups). * + *

Input

+ *

+ * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode). + *

+ * + *

Output

+ *

+ * Indel calls with associated metrics. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SomaticIndelDetector \
+ *   -o indels.vcf \
+ *   -verbose indels.txt
+ *   -I:normal normal.bam \
+ *   -I:tumor tumor.bam
+ * 
* */ + @ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class}) public class SomaticIndelDetectorWalker extends ReadWalker { // @Output // PrintStream out; - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to write variants (indels) in VCF format",required=true) protected VCFWriter vcf_writer = null; @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true) @@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker { @Hidden @Argument(fullName = "genotype_intervals", shortName = "genotype", - doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false) + doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false) public String genotypeIntervalsFile = null; @Hidden @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, - doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ - "if the list turns out to be unsorted, it will throw an exception. "+ - "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ - "to sort and keep it in memory (increases memory usage!).") + doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ + "if the list turns out to be unsorted, it will throw an exception. "+ + "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ + "to sort and keep it in memory (increases memory usage!).") protected boolean GENOTYPE_NOT_SORTED = false; @Hidden - @Argument(fullName="unpaired", shortName="unpaired", - doc="Perform unpaired calls (no somatic status detection)", required=false) + @Argument(fullName="unpaired", shortName="unpaired", + doc="Perform unpaired calls (no somatic status detection)", required=false) boolean call_unpaired = false; - boolean call_somatic ; + boolean call_somatic ; - @Argument(fullName="verboseOutput", shortName="verbose", - doc="Verbose output file in text format", required=false) - java.io.File verboseOutput = null; + @Argument(fullName="verboseOutput", shortName="verbose", + doc="Verbose output file in text format", required=false) + java.io.File verboseOutput = null; @Argument(fullName="bedOutput", shortName="bed", - doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) + doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) java.io.File bedOutput = null; - @Argument(fullName="minCoverage", shortName="minCoverage", - doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false) - int minCoverage = 6; + @Argument(fullName="minCoverage", shortName="minCoverage", + doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+ + "with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false) + int minCoverage = 6; - @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", - doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) - int minNormalCoverage = 4; + @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", + doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+ + "or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) + int minNormalCoverage = 4; - @Argument(fullName="minFraction", shortName="minFraction", - doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ - " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) - double minFraction = 0.3; + @Argument(fullName="minFraction", shortName="minFraction", + doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ + " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) + double minFraction = 0.3; - @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", - doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false) - double minConsensusFraction = 0.7; + @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", + doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+ + "all indel observations at the site exceeds this threshold", required=false) + double minConsensusFraction = 0.7; - @Argument(fullName="minIndelCount", shortName="minCnt", - doc="Minimum count of reads supporting consensus indel required for making the call. "+ - " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ - "(minIndelCount not met) will not pass.", required=false) - int minIndelCount = 0; + @Argument(fullName="minIndelCount", shortName="minCnt", + doc="Minimum count of reads supporting consensus indel required for making the call. "+ + " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ + "(minIndelCount not met) will not pass.", required=false) + int minIndelCount = 0; - @Argument(fullName="refseq", shortName="refseq", - doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) - String RefseqFileName = null; + @Argument(fullName="refseq", shortName="refseq", + doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+ + "GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) + String RefseqFileName = null; - @Argument(fullName="blacklistedLanes", shortName="BL", - doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ - "by this application, so they will not contribute indels to consider and will not be counted.", required=false) - PlatformUnitFilterHelper dummy; - @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false; +//@Argument(fullName="blacklistedLanes", shortName="BL", +// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ +// "by this application, so they will not contribute indels to consider and will not be counted.", required=false) +//PlatformUnitFilterHelper dummy; + + @Hidden + @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on", + required=false) Boolean DEBUG = false; @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ - "May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200; + "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+ + "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+ + "than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+ + "if window is too small",required=false) int WINDOW_SIZE = 200; @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; + + private WindowContext tumor_context; private WindowContext normal_context; private int currentContigIndex = -1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java index a56c9e21e..63fb33295 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java @@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray { public PhasingRead(int length, int mappingQual) { super(length); - this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual)); + this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual)); this.baseProbs = new PreciseNonNegativeDouble[length]; Arrays.fill(this.baseProbs, null); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java index 55da1c152..f94140814 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java @@ -44,12 +44,12 @@ public class RefSeqDataParser { String nameKeyToUseMultiplePrefix = nameKeyToUse + "_"; Map entriesToNames = new HashMap(); - Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY); - if (numRecords != null) { + int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1); + if (numRecords != -1) { boolean done = false; if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1": - String name = vc.getAttributeAsStringNoException(nameKeyToUse); + String name = vc.getAttributeAsString(nameKeyToUse, null); if (name != null) { entriesToNames.put(nameKeyToUse, name); done = true; @@ -59,14 +59,14 @@ public class RefSeqDataParser { if (!done) { for (int i = 1; i <= numRecords; i++) { String key = nameKeyToUseMultiplePrefix + i; - String name = vc.getAttributeAsStringNoException(key); + String name = vc.getAttributeAsString(key, null); if (name != null) entriesToNames.put(key, name); } } } else { // no entry with the # of records: - String name = vc.getAttributeAsStringNoException(nameKeyToUse); + String name = vc.getAttributeAsString(nameKeyToUse, null); if (name != null) { entriesToNames.put(nameKeyToUse, name); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 98c8950e3..1bdb70bdd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -76,6 +76,42 @@ import java.util.Map; *

Output

*

* A recalibration table file in CSV format that is used by the TableRecalibration walker. + * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. + * + * The first 20 lines of such a file is shown below. + * * The file begins with a series of comment lines describing: + * ** The number of counted loci + * ** The number of counted bases + * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases + * + * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. + * + * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change + * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of + * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate. + * + *

+ * # Counted Sites    19451059
+ * # Counted Bases    56582018
+ * # Skipped Sites    82666
+ * # Fraction Skipped 1 / 235 bp
+ * ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
+ * SRR006446,11,65,CA,9,1,10
+ * SRR006446,11,48,TA,10,0,40
+ * SRR006446,11,67,AA,27,0,40
+ * SRR006446,11,61,GA,11,1,10
+ * SRR006446,12,34,CA,47,1,17
+ * SRR006446,12,30,GA,52,1,17
+ * SRR006446,12,36,AA,352,1,25
+ * SRR006446,12,17,TA,182,11,12
+ * SRR006446,11,48,TG,2,0,40
+ * SRR006446,11,67,AG,1,0,40
+ * SRR006446,12,34,CG,9,0,40
+ * SRR006446,12,30,GG,43,0,40
+ * ERR001876,4,31,AG,1,0,40
+ * ERR001876,4,31,AT,2,2,1
+ * ERR001876,4,31,CA,1,0,40
+ * 
*

* *

Examples

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 01e8cd321..48cba6a1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -61,7 +61,7 @@ import java.util.List; * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC *
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: - * + *
  * Valid                     // amplicon is valid
  * SITE_IS_FILTERED=1        // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
  * VARIANT_TOO_NEAR_PROBE=1  // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
@@ -72,10 +72,10 @@ import java.util.List;
  * END_TOO_CLOSE,            // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
  * NO_VARIANTS_FOUND,        // no variants found within the amplicon region
  * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
- * 

+ *

* *

Examples

- *

+ * 
  *    java
  *      -jar GenomeAnalysisTK.jar
  *      -T ValidationAmplicons
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index fe4729bdc..28f4f2a56 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.gatk.walkers.TreeReducible;
 import org.broadinstitute.sting.gatk.walkers.Window;
 import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
+import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression;
 import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
 import org.broadinstitute.sting.gatk.walkers.varianteval.util.*;
 import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche;
@@ -24,6 +25,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.StingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@@ -53,7 +55,23 @@ import java.util.*;
  *
  * 

Output

*

- * Evaluation tables. + * Evaluation tables detailing the results of the eval modules which were applied. + * For example: + *

+ * output.eval.gatkreport:
+ * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
+ * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
+ * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
+ * CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
+ * CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
+ * CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
+ * CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
+ * CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
+ * ...
+ * 
*

* *

Examples

@@ -147,12 +165,12 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; - @Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false) - private String TRANCHE_FILENAME = null; - @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) private File ancestralAlignmentsFile = null; + @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false) + private boolean requireStrictAlleleMatch = false; + // Variables private Set jexlExpressions = new TreeSet(); @@ -230,16 +248,6 @@ public class VariantEvalWalker extends RodWalker implements Tr jexlExpressions.add(sjexl); } - // Add select expressions for anything in the tranches file - if ( TRANCHE_FILENAME != null ) { - // we are going to build a few select names automatically from the tranches file - for ( Tranche t : Tranche.readTranches(new File(TRANCHE_FILENAME)) ) { - logger.info("Adding select for all variant above the pCut of : " + t); - SELECT_EXPS.add(String.format(VariantRecalibrator.VQS_LOD_KEY + " >= %.2f", t.minVQSLod)); - SELECT_NAMES.add(String.format("TS-%.2f", t.ts)); - } - } - // Initialize the set of stratifications and evaluations to use stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); @@ -371,16 +379,16 @@ public class VariantEvalWalker extends RodWalker implements Tr if ( matchingComps.size() == 0 ) return null; - // find the comp which matches the alternate allele from eval + // find the comp which matches both the reference allele and alternate allele from eval Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0); for ( VariantContext comp : matchingComps ) { Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0); - if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) ) + if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) ) return comp; } - // if none match, just return the first one - return matchingComps.get(0); + // if none match, just return the first one unless we require a strict match + return (requireStrictAlleleMatch ? null : matchingComps.get(0)); } public Integer treeReduce(Integer lhs, Integer rhs) { return null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index 2ea64c49c..9facb11b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -22,9 +22,6 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { @DataPoint(description = "number of eval SNP sites") long nEvalVariants = 0; - @DataPoint(description = "number of comp SNP sites") - long nCompVariants = 0; - @DataPoint(description = "number of eval sites outside of comp sites") long novelSites = 0; @@ -75,10 +72,9 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { } public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - boolean evalIsGood = eval != null && eval.isVariant(); - boolean compIsGood = comp != null && comp.isNotFiltered() && (eval == null || comp.getType() == eval.getType()); + boolean evalIsGood = eval != null && eval.isPolymorphic(); + boolean compIsGood = comp != null && comp.isNotFiltered(); - if (compIsGood) nCompVariants++; // count the number of comp events if (evalIsGood) nEvalVariants++; // count the number of eval events if (compIsGood && evalIsGood) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 59ef3d992..72058ba7b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -100,21 +100,22 @@ public class CountVariants extends VariantEvaluator implements StandardEval { // So in order to maintain consistency with the previous implementation (and the intention of the original author), I've // added in a proxy check for monomorphic status here. // Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call. - if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) { + if ( vc1.isMonomorphic() ) { nRefLoci++; } else { switch (vc1.getType()) { case NO_VARIATION: + // shouldn't get here break; case SNP: nVariantLoci++; nSNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; break; case MNP: nVariantLoci++; nMNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; break; case INDEL: nVariantLoci++; @@ -136,7 +137,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval { String refStr = vc1.getReference().getBaseString().toUpperCase(); - String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null; + String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null; // if (aaStr.equals(".")) { // aaStr = refStr; // } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index a476a2680..e69dbfb28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { } public static Double getPQ(Genotype gt) { - return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + return d == -1 ? null : d; } public static boolean topMatchesTop(AllelePair b1, AllelePair b2) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java index 35fffd815..ffe7c185f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -90,18 +90,19 @@ public class IndelLengthHistogram extends VariantEvaluator { public int getComparisonOrder() { return 1; } // need only the evals public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ! vc1.isBiallelic() && vc1.isIndel() ) { - //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); - return vc1.toString(); // biallelic sites are output - } - if ( vc1.isIndel() ) { + if ( vc1.isIndel() && vc1.isPolymorphic() ) { + + if ( ! vc1.isBiallelic() ) { + //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); + return vc1.toString(); // biallelic sites are output + } + + // only count simple insertions/deletions, not complex indels if ( vc1.isSimpleInsertion() ) { indelHistogram.update(vc1.getAlternateAllele(0).length()); } else if ( vc1.isSimpleDeletion() ) { indelHistogram.update(-vc1.getReference().length()); - } else { - throw new ReviewedStingException("Indel type that is not insertion or deletion."); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java index fc347339d..f70e6c2de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java @@ -270,7 +270,7 @@ public class IndelStatistics extends VariantEvaluator { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (eval != null ) { + if (eval != null && eval.isPolymorphic()) { if ( indelStats == null ) { indelStats = new IndelStats(eval); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java index d466645ea..2d0163206 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java @@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval if ( eval.hasGenotypes() ) ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); else if ( eval.hasAttribute("AC") ) { - ac = Integer.valueOf(eval.getAttributeAsString("AC")); + ac = eval.getAttributeAsInt("AC", -1); } if ( ac != -1 ) { @@ -166,7 +166,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval } } - if ( eval.isSNP() && eval.isBiallelic() && metrics != null ) { + if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) { metrics.incrValue(eval); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java index ec43cbd55..e51623c3c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java @@ -37,77 +37,74 @@ public class ThetaVariantEvaluator extends VariantEvaluator { } public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) { + if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) { return null; //no interesting sites } - if (vc.hasGenotypes()) { + //this maps allele to a count + ConcurrentMap alleleCounts = new ConcurrentHashMap(); - //this maps allele to a count - ConcurrentMap alleleCounts = new ConcurrentHashMap(); + int numHetsHere = 0; + float numGenosHere = 0; + int numIndsHere = 0; - int numHetsHere = 0; - float numGenosHere = 0; - int numIndsHere = 0; + for (Genotype genotype : vc.getGenotypes().values()) { + numIndsHere++; + if (!genotype.isNoCall()) { + //increment stats for heterozygosity + if (genotype.isHet()) { + numHetsHere++; + } - for (Genotype genotype : vc.getGenotypes().values()) { - numIndsHere++; - if (!genotype.isNoCall()) { - //increment stats for heterozygosity - if (genotype.isHet()) { - numHetsHere++; - } + numGenosHere++; + //increment stats for pairwise mismatches - numGenosHere++; - //increment stats for pairwise mismatches - - for (Allele allele : genotype.getAlleles()) { - if (allele.isNonNull() && allele.isCalled()) { - String alleleString = allele.toString(); - alleleCounts.putIfAbsent(alleleString, 0); - alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1); - } + for (Allele allele : genotype.getAlleles()) { + if (allele.isNonNull() && allele.isCalled()) { + String alleleString = allele.toString(); + alleleCounts.putIfAbsent(alleleString, 0); + alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1); } } } - if (numGenosHere > 0) { - //only if have one called genotype at least - this.numSites++; + } + if (numGenosHere > 0) { + //only if have one called genotype at least + this.numSites++; - this.totalHet += numHetsHere / numGenosHere; + this.totalHet += numHetsHere / numGenosHere; - //compute based on num sites - float harmonicFactor = 0; - for (int i = 1; i <= numIndsHere; i++) { - harmonicFactor += 1.0 / i; - } - this.thetaRegionNumSites += 1.0 / harmonicFactor; + //compute based on num sites + float harmonicFactor = 0; + for (int i = 1; i <= numIndsHere; i++) { + harmonicFactor += 1.0 / i; + } + this.thetaRegionNumSites += 1.0 / harmonicFactor; - //now compute pairwise mismatches - float numPairwise = 0; - float numDiffs = 0; - for (String allele1 : alleleCounts.keySet()) { - int allele1Count = alleleCounts.get(allele1); + //now compute pairwise mismatches + float numPairwise = 0; + float numDiffs = 0; + for (String allele1 : alleleCounts.keySet()) { + int allele1Count = alleleCounts.get(allele1); - for (String allele2 : alleleCounts.keySet()) { - if (allele1.compareTo(allele2) < 0) { - continue; - } - if (allele1 .compareTo(allele2) == 0) { - numPairwise += allele1Count * (allele1Count - 1) * .5; + for (String allele2 : alleleCounts.keySet()) { + if (allele1.compareTo(allele2) < 0) { + continue; + } + if (allele1 .compareTo(allele2) == 0) { + numPairwise += allele1Count * (allele1Count - 1) * .5; - } - else { - int allele2Count = alleleCounts.get(allele2); - numPairwise += allele1Count * allele2Count; - numDiffs += allele1Count * allele2Count; - } + } + else { + int allele2Count = alleleCounts.get(allele2); + numPairwise += allele1Count * allele2Count; + numDiffs += allele1Count * allele2Count; } } + } - if (numPairwise > 0) { - this.totalAvgDiffs += numDiffs / numPairwise; - } + if (numPairwise > 0) { + this.totalAvgDiffs += numDiffs / numPairwise; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index be957abd7..9b6e145e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } public void updateTiTv(VariantContext vc, boolean updateStandard) { - if (vc != null && vc.isSNP() && vc.isBiallelic()) { + if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) { if (VariantContextUtils.isTransition(vc)) { if (updateStandard) nTiInComp++; else nTi++; @@ -49,18 +49,14 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv else nTv++; } - String refStr = vc.getReference().getBaseString().toUpperCase(); - String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase(); - - if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) { - BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]); - - //System.out.println(refStr + " " + vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase() + " " + aaSubType); - - if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSITION) { - nTiDerived++; - } else if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSVERSION) { - nTvDerived++; + if (vc.hasAttribute("ANCESTRALALLELE")) { + final String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", "null").toUpperCase(); + if ( ! aaStr.equals(".") ) { + switch ( BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0] ) ) { + case TRANSITION: nTiDerived++; break; + case TRANSVERSION: nTvDerived++; break; + default: break; + } } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 9c331b577..c60586017 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -117,7 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { public SiteStatus calcSiteStatus(VariantContext vc) { if ( vc == null ) return SiteStatus.NO_CALL; if ( vc.isFiltered() ) return SiteStatus.FILTERED; - if ( ! vc.isVariant() ) return SiteStatus.MONO; + if ( vc.isMonomorphic() ) return SiteStatus.MONO; + if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { int ac = 0; @@ -130,10 +131,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { //// System.out.printf(" ac = %d%n", ac); } else - ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY); + ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO; - } else if ( vc.hasGenotypes() ) { - return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO; } else { return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do //return SiteStatus.NO_CALL; // we can't figure out what to do diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java index b6ad55b18..263227938 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java @@ -232,7 +232,7 @@ public class VariantQualityScore extends VariantEvaluator { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { final String interesting = null; - if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) + if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) if( titvStats == null ) { titvStats = new TiTvStats(); } titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 3cc22cc52..c7bea93b2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -44,7 +44,7 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = -1; if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) { - AC = eval.getAttributeAsInt("AC"); + AC = eval.getAttributeAsInt("AC", 0); } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getChromosomeCount(allele)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java index 3d2dda651..cd2b8e475 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java @@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier { if (eval != null) { try { - relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3)))); + relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3)))); } catch (Exception e) { return relevantStates; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java index 3223626c0..91c96e490 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java @@ -90,8 +90,8 @@ public class Degeneracy extends VariantStratifier { Integer frame = null; if (eval.hasAttribute("refseq.functionalClass")) { - aa = eval.getAttributeAsString("refseq.variantAA"); - frame = eval.getAttributeAsInt("refseq.frame"); + aa = eval.getAttributeAsString("refseq.variantAA", null); + frame = eval.getAttributeAsInt("refseq.frame", 0); } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -99,7 +99,7 @@ public class Degeneracy extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); + String newtype = eval.getAttributeAsString(key, null); if ( newtype != null && ( type == null || @@ -109,13 +109,13 @@ public class Degeneracy extends VariantStratifier { type = newtype; String aakey = String.format("refseq.variantAA_%d", annotationId); - aa = eval.getAttributeAsString(aakey); + aa = eval.getAttributeAsString(aakey, null); if (aa != null) { String framekey = String.format("refseq.frame_%d", annotationId); if (eval.hasAttribute(framekey)) { - frame = eval.getAttributeAsInt(framekey); + frame = eval.getAttributeAsInt(framekey, 0); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index 193a65591..dbe8295ee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; @@ -11,25 +12,34 @@ import java.util.List; * Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation. */ public class FunctionalClass extends VariantStratifier { - @Override - public void initialize() { - states.add("all"); - states.add("silent"); - states.add("missense"); - states.add("nonsense"); + + public enum FunctionalType { + silent, + missense, + nonsense } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + @Override + public void initialize() { + states.add("all"); + for ( FunctionalType type : FunctionalType.values() ) + states.add(type.name()); + } + + +public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); relevantStates.add("all"); if (eval != null && eval.isVariant()) { - String type = null; + FunctionalType type = null; if (eval.hasAttribute("refseq.functionalClass")) { - type = eval.getAttributeAsString("refseq.functionalClass"); + try { + type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass", null)); + } catch ( Exception e ) {} // don't error out if the type isn't supported } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -37,24 +47,33 @@ public class FunctionalClass extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); - - if ( newtype != null && !newtype.equalsIgnoreCase("null") && - ( type == null || - ( type.equals("silent") && !newtype.equals("silent") ) || - ( type.equals("missense") && newtype.equals("nonsense") ) ) - ) { - type = newtype; + String newtypeStr = eval.getAttributeAsString(key, null); + if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) { + try { + FunctionalType newType = FunctionalType.valueOf(newtypeStr); + if ( type == null || + ( type == FunctionalType.silent && newType != FunctionalType.silent ) || + ( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) { + type = newType; + } + } catch ( Exception e ) {} // don't error out if the type isn't supported } annotationId++; } while (eval.hasAttribute(key)); + + } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) { + SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString()); + if ( snpEffType == SnpEff.EffectType.STOP_GAINED ) + type = FunctionalType.nonsense; + else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING ) + type = FunctionalType.missense; + else if ( snpEffType == SnpEff.EffectType.SYNONYMOUS_CODING ) + type = FunctionalType.silent; } - if (type != null) { - if (type.equals("silent")) { relevantStates.add("silent"); } - else if (type.equals("missense")) { relevantStates.add("missense"); } - else if (type.equals("nonsense")) { relevantStates.add("nonsense"); } + if ( type != null ) { + relevantStates.add(type.name()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 3cc039141..92e7c6554 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -277,7 +277,7 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested samples */ public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection sampleNames) { - VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values()); + VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles()); HashMap newAts = new HashMap(vcsub.getAttributes()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java new file mode 100755 index 000000000..5f688d001 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 3/12/11 + */ + +public class TrainingSet { + + public RodBinding rodBinding; + public boolean isKnown = false; + public boolean isTraining = false; + public boolean isAntiTraining = false; + public boolean isTruth = false; + public boolean isConsensus = false; + public double prior = 0.0; + + protected final static Logger logger = Logger.getLogger(TrainingSet.class); + + public TrainingSet( final RodBinding rodBinding) { + this.rodBinding = rodBinding; + + final Tags tags = rodBinding.getTags(); + final String name = rodBinding.getName(); + + // Parse the tags to decide which tracks have which properties + if( tags != null ) { + isKnown = tags.containsKey("known") && tags.getValue("known").equals("true"); + isTraining = tags.containsKey("training") && tags.getValue("training").equals("true"); + isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true"); + isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true"); + isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true"); + prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior ); + } + + // Report back to the user which tracks were found and the properties that were detected + if( !isConsensus && !isAntiTraining ) { + logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) ); + } else if( isConsensus ) { + logger.info( String.format( "Found consensus track: %s", name) ); + } else { + logger.info( String.format( "Found bad sites training track: %s", name) ); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java index bc7252ec2..04ba3ff14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java @@ -115,7 +115,7 @@ public class VQSRCalibrationCurve { if ( vc.isFiltered() ) return 0.0; else if ( vc.hasAttribute(VQSRQualKey) ) { - double qual = vc.getAttributeAsDouble(VQSRQualKey); + double qual = vc.getAttributeAsDouble(VQSRQualKey, 0.0); return probTrueVariant(qual); } else { throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc); @@ -143,7 +143,7 @@ public class VQSRCalibrationCurve { for ( int i = 0; i < log10Likelihoods.length; i++) { double p = Math.pow(10, log10Likelihoods[i]); double q = alpha * p + (1-alpha) * noInfoPr; - if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q); + if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey, 0.0), p, alpha, q); updated[i] = Math.log10(q); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 429becfc7..e04bfab76 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -51,10 +51,10 @@ public class VariantDataManager { private ExpandingArrayList data; private final double[] meanVector; private final double[] varianceVector; // this is really the standard deviation - public final ArrayList annotationKeys; + public final List annotationKeys; private final VariantRecalibratorArgumentCollection VRAC; protected final static Logger logger = Logger.getLogger(VariantDataManager.class); - + protected final List trainingSets; public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { this.data = null; @@ -62,6 +62,7 @@ public class VariantDataManager { this.VRAC = VRAC; meanVector = new double[this.annotationKeys.size()]; varianceVector = new double[this.annotationKeys.size()]; + trainingSets = new ArrayList(); } public void setData( final ExpandingArrayList data ) { @@ -104,6 +105,31 @@ public class VariantDataManager { } } + public void addTrainingSet( final TrainingSet trainingSet ) { + trainingSets.add( trainingSet ); + } + + public boolean checkHasTrainingSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTraining ) { return true; } + } + return false; + } + + public boolean checkHasTruthSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTruth ) { return true; } + } + return false; + } + + public boolean checkHasKnownSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isKnown ) { return true; } + } + return false; + } + public ExpandingArrayList getTrainingData() { final ExpandingArrayList trainingData = new ExpandingArrayList(); for( final VariantDatum datum : data ) { @@ -232,57 +258,35 @@ public class VariantDataManager { return value; } - public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap rodToPriorMap, - final List> training, final List> truth, final List> known, final List> badSites, final List> resource) { + public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) { datum.isKnown = false; datum.atTruthSite = false; datum.atTrainingSite = false; datum.atAntiTrainingSite = false; datum.prior = 2.0; - //BUGBUG: need to clean this up - - for( final RodBinding rod : training ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { + for( final TrainingSet trainingSet : trainingSets ) { + for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) { if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.atTrainingSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + datum.isKnown = datum.isKnown || trainingSet.isKnown; + datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth; + datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining; + datum.prior = Math.max( datum.prior, trainingSet.prior ); + datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 ); } - } - } - for( final RodBinding rod : truth ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.atTruthSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : known ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.isKnown = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : resource ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { - if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); - } - } - } - for( final RodBinding rod : badSites ) { - for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { if( trainVC != null ) { - datum.atAntiTrainingSite = true; - datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining; } } } } + private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { + return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && + ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); + } + public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { for( final VariantDatum datum : data ) { RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", @@ -290,10 +294,4 @@ public class VariantDataManager { (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"))); } } - - private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { - return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && - ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index df4faebd1..529d17285 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -77,16 +77,15 @@ import java.util.*; *

* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. * - *

Examples

+ *

Example

*
  * java -Xmx4g -jar GenomeAnalysisTK.jar \
  *   -T VariantRecalibrator \
  *   -R reference/human_g1k_v37.fasta \
  *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
- *   -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
- *   -known:prior=8.0 dbsnp_132.b37.vcf \
+ *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
  *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
  *   -recalFile path/to/output.recal \
  *   -tranchesFile path/to/output.tranches \
@@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker> input;
 
     /**
-     * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
-     */
-    @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
-    public List> training;
-
-    /**
-     * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
-     * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
-     */
-    @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
-    public List> truth;
-
-    /**
-     * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
-     * The output metrics are stratified by known status in order to aid in comparisons with other call sets.
-     */
-    @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
-    public List> known = Collections.emptyList();
-
-    /**
-     * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
-     * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
-     */
-    @Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
-    public List> badSites = Collections.emptyList();
-
-    /**
-     * Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
+     * Any set of VCF files to use as lists of training, truth, or known sites.
+     * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
+     * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
+     * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
+     * Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
      */
     @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
     public List> resource = Collections.emptyList();
@@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker ignoreInputFilterSet = new TreeSet();
     private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
-    private final HashMap rodToPriorMap = new HashMap();
 
     //---------------------------------------------------------------------------------------------------------------
     //
@@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker> allInputBindings = new ArrayList>();
-        allInputBindings.addAll(truth);
-        allInputBindings.addAll(training);
-        allInputBindings.addAll(known);
-        allInputBindings.addAll(badSites);
-        allInputBindings.addAll(resource);
-        for( final RodBinding rod : allInputBindings ) {
-            try {
-                rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
-            } catch( NumberFormatException e ) {
-                throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
-            }
+        for( RodBinding rod : resource ) {
+            dataManager.addTrainingSet( new TrainingSet( rod ) );
+        }
+
+        if( !dataManager.checkHasTrainingSet() ) {
+            throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
+        }
+        if( !dataManager.checkHasTruthSet() ) {
+            throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
         }
     }
 
@@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker {
         if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
             return 0;
         
-        List mergedVCs = new ArrayList();
+        List preMergedVCs = new ArrayList();
         Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
         // iterate over the types so that it's deterministic
         for ( VariantContext.Type type : VariantContext.Type.values() ) {
             if ( VCsByType.containsKey(type) )
-                mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
+                preMergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
                         priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
                         SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
         }
 
+        List mergedVCs = new ArrayList();
+        // se have records merged but separated by type. If a particular record is for example a snp but all alleles are a subset of an existing mixed record,
+        // we will still merge those records.
+        if (preMergedVCs.size() > 1) {
+            for (VariantContext vc1 : preMergedVCs) {
+                VariantContext newvc = vc1;
+                boolean merged = false;
+                for (int k=0; k < mergedVCs.size(); k++) {
+                    VariantContext vc2 = mergedVCs.get(k);
+
+                    if (VariantContextUtils.allelesAreSubset(vc1,vc2) || VariantContextUtils.allelesAreSubset(vc2,vc1)) {
+                        // all alleles of vc1 are contained in vc2 but they are of different type (say, vc1 is snp, vc2 is complex): try to merget v1 into v2
+                        List vcpair = new ArrayList();
+                        vcpair.add(vc1);
+                        vcpair.add(vc2);
+                        newvc = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcpair,
+                                               priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
+                                               SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC);
+                        mergedVCs.set(k,newvc);
+                        merged = true;
+                        break;
+                    }
+                }
+                if (!merged)
+                    mergedVCs.add(vc1);
+            }
+        }
+        else {
+            mergedVCs = preMergedVCs;
+        }
+
         for ( VariantContext mergedVC : mergedVCs ) {
             // only operate at the start of events
             if ( mergedVC == null )
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java
index 1c76a21ea..a932d44ed 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java
@@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker {
 
 
         final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
-        writer = new StandardVCFWriter(file, false);
+        writer = new StandardVCFWriter(file, getMasterSequenceDictionary(), false);
         writer.writeHeader(vcfHeader);
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java
index 1fefd20fc..fa5093839 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java
@@ -75,7 +75,7 @@ public class RandomlySplitVariants extends RodWalker {
         hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
 
         vcfWriter1.writeHeader(new VCFHeader(hInfo, samples));
-        vcfWriter2 = new StandardVCFWriter(file2, true);
+        vcfWriter2 = new StandardVCFWriter(file2, getMasterSequenceDictionary(), true);
         vcfWriter2.writeHeader(new VCFHeader(hInfo, samples));
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index 35ff66243..89d103f52 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -145,10 +145,9 @@ import java.util.*;
  *   -R ref.fasta \
  *   -T SelectVariants \
  *   --variant input.vcf \
- *   -o output.vcf \
- *   -SM family.yaml \
  *   -family NA12891+NA12892=NA12878 \
- *   -mvq 50
+ *   -mvq 50 \
+ *   -o violations.vcf
  *
  * Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
@@ -265,17 +264,17 @@ public class SelectVariants extends RodWalker {
     private File AF_FILE = new File("");
 
     @Hidden
-    @Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+    @Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false)
     private File FAMILY_STRUCTURE_FILE = null;
 
     /**
      * String formatted as dad+mom=child where these parameters determine which sample names are examined.
      */
-    @Argument(fullName="family_structure", shortName="family", doc="Deprecated; use the -SM argument instead", required=false)
+    @Argument(fullName="family_structure", shortName="family", doc="string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
     private String FAMILY_STRUCTURE = "";
 
     /**
-     * Sample metadata information will be taken from a YAML file (see the -SM argument).
+     * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure.
      */
     @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
     private Boolean MENDELIAN_VIOLATIONS = false;
@@ -306,7 +305,7 @@ public class SelectVariants extends RodWalker {
 
 
     @Hidden
-    @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+    @Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false)
     private String outMVFile = null;
 
     /* Private class used to store the intermediate variants in the integer random selection process */
@@ -575,7 +574,7 @@ public class SelectVariants extends RodWalker {
                         // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
                         // We then pick a variant with probablity AF*desiredFraction
                         if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) )  {
-                            String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
+                            String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
 
                             double af;
                             double afBoost = 1.0;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
index 2c7902914..fdfca982c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
@@ -56,11 +56,6 @@ import java.util.Set;
  * A variant set to filter.
  * 

* - *

Output

- *

- * A filtered VCF. - *

- * *

Examples

*
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
index b98646270..8eaf976d0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
@@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
 import java.util.*;
 
 /**
- * Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
+ * Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes)
  *
  * 

* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). @@ -57,7 +57,16 @@ import java.util.*; * *

Output

*

- * An annotated VCF. + * An annotated VCF. Additionally, a table like the following will be output: + *

+ *     Total number of samples assayed:                  185
+ *     Total number of records processed:                152
+ *     Number of Hardy-Weinberg violations:              34 (22%)
+ *     Number of no-call violations:                     12 (7%)
+ *     Number of homozygous variant violations:          0 (0%)
+ *     Number of records passing all filters:            106 (69%)
+ *     Number of passing records that are polymorphic:   98 (92%)
+ * 
*

* *

Examples

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 2a877fb09..c44d84136 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -192,7 +192,7 @@ public class VariantsToTable extends RodWalker { if ( getters.containsKey(field) ) { val = getters.get(field).get(vc); } else if ( vc.hasAttribute(field) ) { - val = vc.getAttributeAsString(field); + val = vc.getAttributeAsString(field, null); } else if ( isWildCard(field) ) { Set wildVals = new HashSet(); for ( Map.Entry elt : vc.getAttributes().entrySet()) { @@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker { getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } }); getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index b96923589..b66198713 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome @Override public int hashCode() { - return (int)( start << 16 + stop << 4 + contigIndex ); + return start << 16 | stop << 4 | contigIndex; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index f4c057c15..0d85f9606 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1056,42 +1056,30 @@ public class MathUtils { } static public double softMax(final double x, final double y) { - if (Double.isInfinite(x)) - return y; + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup + // with integer quantization - if (Double.isInfinite(y)) - return x; + // slow exact version: + // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - if (y >= x + MAX_JACOBIAN_TOLERANCE) - return y; - if (x >= y + MAX_JACOBIAN_TOLERANCE) - return x; + double diff = x-y; - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - - //double diff = Math.abs(x-y); - double diff = x-y; - double t1 =x; - if (diff<0) { // - t1 = y; - diff= -diff; - } - // t has max(x,y), diff has abs(x-y) - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - //int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP); - int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); - // gdebug+ - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - //gdebug- - return t1+jacobianLogTable[ind]; - // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - } + if (diff > MAX_JACOBIAN_TOLERANCE) + return x; + else if (diff < -MAX_JACOBIAN_TOLERANCE) + return y; + else if (diff >= 0) { + int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + return x + jacobianLogTable[ind]; + } + else { + int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + return y + jacobianLogTable[ind]; + } + } public static double phredScaleToProbability (byte q) { return Math.pow(10,(-q)/10.0); diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index fad2320fc..19e03a19d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,14 +9,17 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { - public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double MIN_REASONABLE_ERROR = 0.0001; public final static byte MAX_REASONABLE_Q_SCORE = 40; public final static byte MIN_USABLE_Q_SCORE = 6; - public final static int MAPPING_QUALITY_UNAVAILABLE = 255; + private static double qualToErrorProbCache[] = new double[256]; + static { + for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i); + } + /** * Private constructor. No instantiating this class! */ @@ -33,10 +36,6 @@ public class QualityUtils { return 1.0 - qualToErrorProb(qual); } - static public double qualToProb(int qual) { - return qualToProb( (double)qual ); - } - static public double qualToProb(double qual) { return 1.0 - Math.pow(10.0, qual/(-10.0)); } @@ -48,10 +47,14 @@ public class QualityUtils { * @param qual a quality score (0-40) * @return a probability (0.0-1.0) */ - static public double qualToErrorProb(byte qual) { + static public double qualToErrorProbRaw(byte qual) { return Math.pow(10.0, ((double) qual)/-10.0); } + static public double qualToErrorProb(byte qual) { + return qualToErrorProbCache[qual]; + } + /** * Convert a probability to a quality score. Note, this is capped at Q40. * @@ -110,88 +113,4 @@ public class QualityUtils { //return (byte) Math.min(qual, maxQual); return (byte) Math.max(Math.min(qual, maxQual), 1); } - - /** - * Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field. - * Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis. - * Another note: the probability encoded here gets rounded to the nearest 1%. - * - * @param baseIndex the base index - * @param prob the base probability - * @return a byte containing the index and the probability - */ - static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) { - byte compressedQual = 0; - - compressedQual = (byte) baseIndex; - - byte cprob = (byte) (100.0*prob); - byte qualmask = (byte) 252; - compressedQual += ((cprob << 2) & qualmask); - - return compressedQual; - } - - /** - * From a compressed base, extract the base index (0:A, 1:C, 2:G, 3:T) - * - * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality - * @return base index - */ - static public int compressedQualityToBaseIndex(byte compressedQual) { - return (int) (compressedQual & 0x3); - } - - /** - * From a compressed base, extract the base probability - * - * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality - * @return the probability - */ - static public double compressedQualityToProb(byte compressedQual) { - // Because java natives are signed, extra care must be taken to avoid - // shifting a 1 into the sign bit in the implicit promotion of 2 to an int. - int x2 = ((int) compressedQual) & 0xff; - x2 = (x2 >>> 2); - - return ((double) x2)/100.0; - } - - /** - * Return the complement of a compressed quality - * - * @param compressedQual the compressed quality score (as returned by baseAndProbToCompressedQuality) - * @return the complementary compressed quality - */ - static public byte complementCompressedQuality(byte compressedQual) { - int baseIndex = compressedQualityToBaseIndex(compressedQual); - double prob = compressedQualityToProb(compressedQual); - - return baseAndProbToCompressedQuality(BaseUtils.complementIndex(baseIndex), prob); - } - - /** - * Return the reverse complement of a byte array of compressed qualities - * - * @param compressedQuals a byte array of compressed quality scores - * @return the reverse complement of the byte array - */ - static public byte[] reverseComplementCompressedQualityArray(byte[] compressedQuals) { - byte[] rcCompressedQuals = new byte[compressedQuals.length]; - - for (int pos = 0; pos < compressedQuals.length; pos++) { - rcCompressedQuals[compressedQuals.length - pos - 1] = complementCompressedQuality(compressedQuals[pos]); - } - - return rcCompressedQuals; - } - - /** - * Return the reverse of a byte array of qualities (compressed or otherwise) - * @param quals the array of bytes to be reversed - * @return the reverse of the quality array - */ - static public byte[] reverseQualityArray( byte[] quals ) { - return Utils.reverse(quals); // no sense in duplicating functionality - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index f9997bfd8..1b4703e4a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -190,11 +190,21 @@ public class SampleUtils { } - public static List getSamplesFromCommandLineInput(Collection sampleArgs) { + /** + * Returns a new set of samples, containing a final list of samples expanded from sampleArgs + * + * Each element E of sampleArgs can either be a literal sample name or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique sample names. + * + * @param sampleArgs + * @return + */ + public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { if (sampleArgs != null) { // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our // sample list set, and treat the entries as if they had been specified on the command line. - List samplesFromFiles = new ArrayList(); + Set samplesFromFiles = new HashSet(); for (String SAMPLE_EXPRESSION : sampleArgs) { File sampleFile = new File(SAMPLE_EXPRESSION); @@ -203,7 +213,7 @@ public class SampleUtils { List lines = reader.readLines(); for (String line : lines) { - samplesFromFiles.add(line); + samplesFromFiles.add(line.trim()); } } catch (FileNotFoundException e) { samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample @@ -212,7 +222,8 @@ public class SampleUtils { return samplesFromFiles; } - return new ArrayList(); + + return new HashSet(); } public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index d94d9ff84..f142fa5aa 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.ArrayList; /** - * TODO FOR CHRIS HARTL + * Allows for reading in RefSeq information * *

- * Codec Description + * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, + * strandedness of transcription. *

* *

- * See also: link to file specification + * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here + * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq *

+ *

Usage

+ * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example + *
+ * -refSeqBinding:REFSEQ /path/to/refSeq.txt
+ * 
+ * + * You will need to consult individual walkers for the binding name ("refSeqBinding", above) * *

File format example

+ * If you want to define your own file for use, the format is (tab delimited): + * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) + * and exon frames, for example: + *
+ * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
+ * 
+ * for more information see here *

- * A BAM file containing exactly one sample. + * *

* * @author Mark DePristo diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java deleted file mode 100644 index 7f3d9e17d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec; - -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; - -/** - * Codec for decoding the output format of the SnpEff variant effect predictor tool - * - *

- * This format has 23 tab-delimited fields: - * - *

- * Chromosome
- * Position
- * Reference
- * Change
- * Change Type: {SNP, MNP, INS, DEL}
- * Zygosity: {Hom, Het}
- * Quality
- * Coverage
- * Warnings
- * Gene ID
- * Gene Name
- * Bio Type
- * Transcript ID
- * Exon ID
- * Exon Rank
- * Effect
- * Old/New Amino Acid
- * Old/New Codon
- * Codon Num
- * CDS Size
- * Codons Around
- * Amino Acids Around
- * Custom Interval ID
- * 
- * Note that we treat all except the Chromosome, Position, and Effect fields as optional. - *

- * - *

- * See also: @see SNPEff project page - *

- * - * @author David Roazen - * @since 2011 - */ -public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec { - - public static final int EXPECTED_NUMBER_OF_FIELDS = 23; - public static final String FIELD_DELIMITER_PATTERN = "\\t"; - public static final String EFFECT_FIELD_DELIMITER_PATTERN = "[,:]"; - public static final String HEADER_LINE_START = "# "; - public static final String[] HEADER_FIELD_NAMES = { "Chromo", - "Position", - "Reference", - "Change", - "Change type", - "Homozygous", - "Quality", - "Coverage", - "Warnings", - "Gene_ID", - "Gene_name", - "Bio_type", - "Trancript_ID", // yes, this is how it's spelled in the SnpEff output - "Exon_ID", - "Exon_Rank", - "Effect", - "old_AA/new_AA", - "Old_codon/New_codon", - "Codon_Num(CDS)", - "CDS_size", - "Codons around", - "AAs around", - "Custom_interval_ID" - }; - - // The "Chromo", "Position", and "Effect" fields are required to be non-empty in every SnpEff output line: - public static final int[] REQUIRED_FIELDS = { 0, 1, 15 }; - - public static final String NON_CODING_GENE_FLAG = "WITHIN_NON_CODING_GENE"; - - - public Feature decodeLoc ( String line ) { - return decode(line); - } - - public Feature decode ( String line ) { - String[] tokens = line.split(FIELD_DELIMITER_PATTERN, -1); - - if ( tokens.length != EXPECTED_NUMBER_OF_FIELDS ) { - throw new TribbleException.InvalidDecodeLine("Line does not have the expected (" + EXPECTED_NUMBER_OF_FIELDS + - ") number of fields: found " + tokens.length + " fields.", line); - } - - try { - trimAllFields(tokens); - checkForRequiredFields(tokens, line); - - String contig = tokens[0]; - long position = Long.parseLong(tokens[1]); - - String reference = tokens[2].isEmpty() ? null : tokens[2]; - String change = tokens[3].isEmpty() ? null : tokens[3]; - ChangeType changeType = tokens[4].isEmpty() ? null : ChangeType.valueOf(tokens[4]); - Zygosity zygosity = tokens[5].isEmpty() ? null : Zygosity.valueOf(tokens[5]); - Double quality = tokens[6].isEmpty() ? null : Double.parseDouble(tokens[6]); - Long coverage = tokens[7].isEmpty() ? null : Long.parseLong(tokens[7]); - String warnings = tokens[8].isEmpty() ? null : tokens[8]; - String geneID = tokens[9].isEmpty() ? null : tokens[9]; - String geneName = tokens[10].isEmpty() ? null : tokens[10]; - String bioType = tokens[11].isEmpty() ? null : tokens[11]; - String transcriptID = tokens[12].isEmpty() ? null : tokens[12]; - String exonID = tokens[13].isEmpty() ? null : tokens[13]; - Integer exonRank = tokens[14].isEmpty() ? null : Integer.parseInt(tokens[14]); - - boolean isNonCodingGene = isNonCodingGene(tokens[15]); - - // Split the effect field into three subfields if the WITHIN_NON_CODING_GENE flag is present, - // otherwise split it into two subfields. We need this limit to prevent the extra effect-related information - // in the final field (when present) from being inappropriately tokenized: - - int effectFieldTokenLimit = isNonCodingGene ? 3 : 2; - String[] effectFieldTokens = tokens[15].split(EFFECT_FIELD_DELIMITER_PATTERN, effectFieldTokenLimit); - EffectType effect = parseEffect(effectFieldTokens, isNonCodingGene); - String effectExtraInformation = parseEffectExtraInformation(effectFieldTokens, isNonCodingGene); - - String oldAndNewAA = tokens[16].isEmpty() ? null : tokens[16]; - String oldAndNewCodon = tokens[17].isEmpty() ? null : tokens[17]; - Integer codonNum = tokens[18].isEmpty() ? null : Integer.parseInt(tokens[18]); - Integer cdsSize = tokens[19].isEmpty() ? null : Integer.parseInt(tokens[19]); - String codonsAround = tokens[20].isEmpty() ? null : tokens[20]; - String aasAround = tokens[21].isEmpty() ? null : tokens[21]; - String customIntervalID = tokens[22].isEmpty() ? null : tokens[22]; - - return new SnpEffFeature(contig, position, reference, change, changeType, zygosity, quality, coverage, - warnings, geneID, geneName, bioType, transcriptID, exonID, exonRank, isNonCodingGene, - effect, effectExtraInformation, oldAndNewAA, oldAndNewCodon, codonNum, cdsSize, - codonsAround, aasAround, customIntervalID); - } - catch ( NumberFormatException e ) { - throw new TribbleException.InvalidDecodeLine("Error parsing a numeric field : " + e.getMessage(), line); - } - catch ( IllegalArgumentException e ) { - throw new TribbleException.InvalidDecodeLine("Illegal value in field: " + e.getMessage(), line); - } - } - - private void trimAllFields ( String[] tokens ) { - for ( int i = 0; i < tokens.length; i++ ) { - tokens[i] = tokens[i].trim(); - } - } - - private void checkForRequiredFields ( String[] tokens, String line ) { - for ( int requiredFieldIndex : REQUIRED_FIELDS ) { - if ( tokens[requiredFieldIndex].isEmpty() ) { - throw new TribbleException.InvalidDecodeLine("Line is missing required field \"" + - HEADER_FIELD_NAMES[requiredFieldIndex] + "\"", - line); - } - } - } - - private boolean isNonCodingGene ( String effectField ) { - return effectField.startsWith(NON_CODING_GENE_FLAG); - } - - private EffectType parseEffect ( String[] effectFieldTokens, boolean isNonCodingGene ) { - String effectName = ""; - - // If there's a WITHIN_NON_CODING_GENE flag, the effect name will be in the second subfield, - // otherwise it will be in the first subfield: - - if ( effectFieldTokens.length > 1 && isNonCodingGene ) { - effectName = effectFieldTokens[1].trim(); - } - else { - effectName = effectFieldTokens[0].trim(); - } - - return EffectType.valueOf(effectName); - } - - private String parseEffectExtraInformation ( String[] effectFieldTokens, boolean isNonCodingGene ) { - - // The extra effect-related information, if present, will always be the last subfield: - - if ( (effectFieldTokens.length == 2 && ! isNonCodingGene) || effectFieldTokens.length == 3 ) { - return effectFieldTokens[effectFieldTokens.length - 1].trim(); - } - - return null; - } - - public Class getFeatureType() { - return SnpEffFeature.class; - } - - public Object readHeader ( LineReader reader ) { - String headerLine = ""; - - try { - headerLine = reader.readLine(); - } - catch ( IOException e ) { - throw new TribbleException("Unable to read header line from input file."); - } - - validateHeaderLine(headerLine); - return headerLine; - } - - private void validateHeaderLine ( String headerLine ) { - if ( headerLine == null || ! headerLine.startsWith(HEADER_LINE_START) ) { - throw new TribbleException.InvalidHeader("Header line does not start with " + HEADER_LINE_START); - } - - String[] headerTokens = headerLine.substring(HEADER_LINE_START.length()).split(FIELD_DELIMITER_PATTERN); - - if ( headerTokens.length != EXPECTED_NUMBER_OF_FIELDS ) { - throw new TribbleException.InvalidHeader("Header line does not contain headings for the expected number (" + - EXPECTED_NUMBER_OF_FIELDS + ") of columns."); - } - - for ( int columnIndex = 0; columnIndex < headerTokens.length; columnIndex++ ) { - if ( ! HEADER_FIELD_NAMES[columnIndex].equals(headerTokens[columnIndex]) ) { - throw new TribbleException.InvalidHeader("Header field #" + columnIndex + ": Expected \"" + - HEADER_FIELD_NAMES[columnIndex] + "\" but found \"" + - headerTokens[columnIndex] + "\""); - } - } - } - - public boolean canDecode ( final File potentialInput ) { - try { - LineReader reader = new AsciiLineReader(new FileInputStream(potentialInput)); - readHeader(reader); - } - catch ( Exception e ) { - return false; - } - - return true; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java deleted file mode 100644 index 270db470f..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffConstants.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -/** - * A set of constants associated with the SnpEff codec. - * - * @author David Roazen - */ -public class SnpEffConstants { - - // Possible SnpEff biological effects and their associated impacts: - public enum EffectType { - START_GAINED (EffectImpact.HIGH), - START_LOST (EffectImpact.HIGH), - EXON_DELETED (EffectImpact.HIGH), - FRAME_SHIFT (EffectImpact.HIGH), - STOP_GAINED (EffectImpact.HIGH), - STOP_LOST (EffectImpact.HIGH), - SPLICE_SITE_ACCEPTOR (EffectImpact.HIGH), - SPLICE_SITE_DONOR (EffectImpact.HIGH), - - NON_SYNONYMOUS_CODING (EffectImpact.MODERATE), - UTR_5_DELETED (EffectImpact.MODERATE), - UTR_3_DELETED (EffectImpact.MODERATE), - CODON_INSERTION (EffectImpact.MODERATE), - CODON_CHANGE_PLUS_CODON_INSERTION (EffectImpact.MODERATE), - CODON_DELETION (EffectImpact.MODERATE), - CODON_CHANGE_PLUS_CODON_DELETION (EffectImpact.MODERATE), - - NONE (EffectImpact.LOW), - CHROMOSOME (EffectImpact.LOW), - INTERGENIC (EffectImpact.LOW), - UPSTREAM (EffectImpact.LOW), - UTR_5_PRIME (EffectImpact.LOW), - SYNONYMOUS_START (EffectImpact.LOW), - NON_SYNONYMOUS_START (EffectImpact.LOW), - CDS (EffectImpact.LOW), - GENE (EffectImpact.LOW), - TRANSCRIPT (EffectImpact.LOW), - EXON (EffectImpact.LOW), - SYNONYMOUS_CODING (EffectImpact.LOW), - CODON_CHANGE (EffectImpact.LOW), - SYNONYMOUS_STOP (EffectImpact.LOW), - NON_SYNONYMOUS_STOP (EffectImpact.LOW), - INTRON (EffectImpact.LOW), - UTR_3_PRIME (EffectImpact.LOW), - DOWNSTREAM (EffectImpact.LOW), - INTRON_CONSERVED (EffectImpact.LOW), - INTERGENIC_CONSERVED (EffectImpact.LOW), - CUSTOM (EffectImpact.LOW); - - private final EffectImpact impact; - - EffectType ( EffectImpact impact ) { - this.impact = impact; - } - - public EffectImpact getImpact() { - return impact; - } - } - - public enum EffectImpact { - LOW (1), - MODERATE (2), - HIGH (3); - - private final int severityRating; - - EffectImpact ( int severityRating ) { - this.severityRating = severityRating; - } - - public boolean isHigherImpactThan ( EffectImpact other ) { - return this.severityRating > other.severityRating; - } - } - - // The kinds of variants supported by the SnpEff output format: - public enum ChangeType { - SNP, - MNP, - INS, - DEL - } - - // Possible zygosities of SnpEff variants: - public enum Zygosity { - Hom, - Het - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java deleted file mode 100644 index 2f120b7d2..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffFeature.java +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -import org.broad.tribble.Feature; - -import java.util.NoSuchElementException; - -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectImpact; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; - -/** - * Feature returned by the SnpEff codec -- stores the parsed field values from a line of SnpEff output. - * - * Many fields are optional, and missing values are represented by nulls. You should always call the - * hasX() method before calling the corresponding getX() method. Required fields can never be null - * and do not have a hasX() method. - * - * @author David Roazen - */ -public class SnpEffFeature implements Feature { - - private String contig; // REQUIRED FIELD - private long position; // REQUIRED FIELD - private String reference; - private String change; - private ChangeType changeType; - private Zygosity zygosity; - private Double quality; - private Long coverage; - private String warnings; - private String geneID; - private String geneName; - private String bioType; - private String transcriptID; - private String exonID; - private Integer exonRank; - private boolean isNonCodingGene; // REQUIRED FIELD - private EffectType effect; // REQUIRED FIELD - private String effectExtraInformation; - private String oldAndNewAA; - private String oldAndNewCodon; - private Integer codonNum; - private Integer cdsSize; - private String codonsAround; - private String aasAround; - private String customIntervalID; - - public SnpEffFeature ( String contig, - long position, - String reference, - String change, - ChangeType changeType, - Zygosity zygosity, - Double quality, - Long coverage, - String warnings, - String geneID, - String geneName, - String bioType, - String transcriptID, - String exonID, - Integer exonRank, - boolean isNonCodingGene, - EffectType effect, - String effectExtraInformation, - String oldAndNewAA, - String oldAndNewCodon, - Integer codonNum, - Integer cdsSize, - String codonsAround, - String aasAround, - String customIntervalID ) { - - if ( contig == null || effect == null ) { - throw new IllegalArgumentException("contig and effect cannot be null, as they are required fields"); - } - - this.contig = contig; - this.position = position; - this.reference = reference; - this.change = change; - this.changeType = changeType; - this.zygosity = zygosity; - this.quality = quality; - this.coverage = coverage; - this.warnings = warnings; - this.geneID = geneID; - this.geneName = geneName; - this.bioType = bioType; - this.transcriptID = transcriptID; - this.exonID = exonID; - this.exonRank = exonRank; - this.isNonCodingGene = isNonCodingGene; - this.effect = effect; - this.effectExtraInformation = effectExtraInformation; - this.oldAndNewAA = oldAndNewAA; - this.oldAndNewCodon = oldAndNewCodon; - this.codonNum = codonNum; - this.cdsSize = cdsSize; - this.codonsAround = codonsAround; - this.aasAround = aasAround; - this.customIntervalID = customIntervalID; - } - - public boolean isHigherImpactThan ( SnpEffFeature other ) { - - // If one effect is in a non-coding gene and the other is not, the effect NOT in the - // non-coding gene has higher impact: - - if ( ! isNonCodingGene() && other.isNonCodingGene() ) { - return true; - } - else if ( isNonCodingGene() && ! other.isNonCodingGene() ) { - return false; - } - - // Otherwise, both effects are either in or not in a non-coding gene, so we compare the impacts - // of the effects themselves as defined in the SnpEffConstants class: - - return getEffectImpact().isHigherImpactThan(other.getEffectImpact()); - } - - public String getChr() { - return contig; - } - - public int getStart() { - return (int)position; - } - - public int getEnd() { - return (int)position; - } - - public boolean hasReference() { - return reference != null; - } - - public String getReference() { - if ( reference == null ) throw new NoSuchElementException("This feature has no reference field"); - return reference; - } - - public boolean hasChange() { - return change != null; - } - - public String getChange() { - if ( change == null ) throw new NoSuchElementException("This feature has no change field"); - return change; - } - - public boolean hasChangeType() { - return changeType != null; - } - - public ChangeType getChangeType() { - if ( changeType == null ) throw new NoSuchElementException("This feature has no changeType field"); - return changeType; - } - - public boolean hasZygosity() { - return zygosity != null; - } - - public Zygosity getZygosity() { - if ( zygosity == null ) throw new NoSuchElementException("This feature has no zygosity field"); - return zygosity; - } - - public boolean hasQuality() { - return quality != null; - } - - public Double getQuality() { - if ( quality == null ) throw new NoSuchElementException("This feature has no quality field"); - return quality; - } - - public boolean hasCoverage() { - return coverage != null; - } - - public Long getCoverage() { - if ( coverage == null ) throw new NoSuchElementException("This feature has no coverage field"); - return coverage; - } - - public boolean hasWarnings() { - return warnings != null; - } - - public String getWarnings() { - if ( warnings == null ) throw new NoSuchElementException("This feature has no warnings field"); - return warnings; - } - - public boolean hasGeneID() { - return geneID != null; - } - - public String getGeneID() { - if ( geneID == null ) throw new NoSuchElementException("This feature has no geneID field"); - return geneID; - } - - public boolean hasGeneName() { - return geneName != null; - } - - public String getGeneName() { - if ( geneName == null ) throw new NoSuchElementException("This feature has no geneName field"); - return geneName; - } - - public boolean hasBioType() { - return bioType != null; - } - - public String getBioType() { - if ( bioType == null ) throw new NoSuchElementException("This feature has no bioType field"); - return bioType; - } - - public boolean hasTranscriptID() { - return transcriptID != null; - } - - public String getTranscriptID() { - if ( transcriptID == null ) throw new NoSuchElementException("This feature has no transcriptID field"); - return transcriptID; - } - - public boolean hasExonID() { - return exonID != null; - } - - public String getExonID() { - if ( exonID == null ) throw new NoSuchElementException("This feature has no exonID field"); - return exonID; - } - - public boolean hasExonRank() { - return exonRank != null; - } - - public Integer getExonRank() { - if ( exonRank == null ) throw new NoSuchElementException("This feature has no exonRank field"); - return exonRank; - } - - public boolean isNonCodingGene() { - return isNonCodingGene; - } - - public EffectType getEffect() { - return effect; - } - - public EffectImpact getEffectImpact() { - return effect.getImpact(); - } - - public boolean hasEffectExtraInformation() { - return effectExtraInformation != null; - } - - public String getEffectExtraInformation() { - if ( effectExtraInformation == null ) throw new NoSuchElementException("This feature has no effectExtraInformation field"); - return effectExtraInformation; - } - - public boolean hasOldAndNewAA() { - return oldAndNewAA != null; - } - - public String getOldAndNewAA() { - if ( oldAndNewAA == null ) throw new NoSuchElementException("This feature has no oldAndNewAA field"); - return oldAndNewAA; - } - - public boolean hasOldAndNewCodon() { - return oldAndNewCodon != null; - } - - public String getOldAndNewCodon() { - if ( oldAndNewCodon == null ) throw new NoSuchElementException("This feature has no oldAndNewCodon field"); - return oldAndNewCodon; - } - - public boolean hasCodonNum() { - return codonNum != null; - } - - public Integer getCodonNum() { - if ( codonNum == null ) throw new NoSuchElementException("This feature has no codonNum field"); - return codonNum; - } - - public boolean hasCdsSize() { - return cdsSize != null; - } - - public Integer getCdsSize() { - if ( cdsSize == null ) throw new NoSuchElementException("This feature has no cdsSize field"); - return cdsSize; - } - - public boolean hasCodonsAround() { - return codonsAround != null; - } - - public String getCodonsAround() { - if ( codonsAround == null ) throw new NoSuchElementException("This feature has no codonsAround field"); - return codonsAround; - } - - public boolean hadAasAround() { - return aasAround != null; - } - - public String getAasAround() { - if ( aasAround == null ) throw new NoSuchElementException("This feature has no aasAround field"); - return aasAround; - } - - public boolean hasCustomIntervalID() { - return customIntervalID != null; - } - - public String getCustomIntervalID() { - if ( customIntervalID == null ) throw new NoSuchElementException("This feature has no customIntervalID field"); - return customIntervalID; - } - - public boolean equals ( Object o ) { - if ( o == null || ! (o instanceof SnpEffFeature) ) { - return false; - } - - SnpEffFeature other = (SnpEffFeature)o; - - return contig.equals(other.contig) && - position == other.position && - (reference == null ? other.reference == null : reference.equals(other.reference)) && - (change == null ? other.change == null : change.equals(other.change)) && - changeType == other.changeType && - zygosity == other.zygosity && - (quality == null ? other.quality == null : quality.equals(other.quality)) && - (coverage == null ? other.coverage == null : coverage.equals(other.coverage)) && - (warnings == null ? other.warnings == null : warnings.equals(other.warnings)) && - (geneID == null ? other.geneID == null : geneID.equals(other.geneID)) && - (geneName == null ? other.geneName == null : geneName.equals(other.geneName)) && - (bioType == null ? other.bioType == null : bioType.equals(other.bioType)) && - (transcriptID == null ? other.transcriptID == null : transcriptID.equals(other.transcriptID)) && - (exonID == null ? other.exonID == null : exonID.equals(other.exonID)) && - (exonRank == null ? other.exonRank == null : exonRank.equals(other.exonRank)) && - isNonCodingGene == other.isNonCodingGene && - effect == other.effect && - (effectExtraInformation == null ? other.effectExtraInformation == null : effectExtraInformation.equals(other.effectExtraInformation)) && - (oldAndNewAA == null ? other.oldAndNewAA == null : oldAndNewAA.equals(other.oldAndNewAA)) && - (oldAndNewCodon == null ? other.oldAndNewCodon == null : oldAndNewCodon.equals(other.oldAndNewCodon)) && - (codonNum == null ? other.codonNum == null : codonNum.equals(other.codonNum)) && - (cdsSize == null ? other.cdsSize == null : cdsSize.equals(other.cdsSize)) && - (codonsAround == null ? other.codonsAround == null : codonsAround.equals(other.codonsAround)) && - (aasAround == null ? other.aasAround == null : aasAround.equals(other.aasAround)) && - (customIntervalID == null ? other.customIntervalID == null : customIntervalID.equals(other.customIntervalID)); - } - - public String toString() { - return "[Contig: " + contig + - " Position: " + position + - " Reference: " + reference + - " Change: " + change + - " Change Type: " + changeType + - " Zygosity: " + zygosity + - " Quality: " + quality + - " Coverage: " + coverage + - " Warnings: " + warnings + - " Gene ID: " + geneID + - " Gene Name: " + geneName + - " Bio Type: " + bioType + - " Transcript ID: " + transcriptID + - " Exon ID: " + exonID + - " Exon Rank: " + exonRank + - " Non-Coding Gene: " + isNonCodingGene + - " Effect: " + effect + - " Effect Extra Information: " + effectExtraInformation + - " Old/New AA: " + oldAndNewAA + - " Old/New Codon: " + oldAndNewCodon + - " Codon Num: " + codonNum + - " CDS Size: " + cdsSize + - " Codons Around: " + codonsAround + - " AAs Around: " + aasAround + - " Custom Interval ID: " + customIntervalID + - "]"; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index bb212e128..624d06a71 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -227,7 +227,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, throw new UserException.MalformedVCF(message, lineNo); } - private static void generateException(String message, int lineNo) { + protected static void generateException(String message, int lineNo) { throw new UserException.MalformedVCF(message, lineNo); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java new file mode 100644 index 000000000..71ec4ce1b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.vcf; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMSequenceDictionary; +import org.broad.tribble.Tribble; +import org.broad.tribble.TribbleException; +import org.broad.tribble.index.DynamicIndexCreator; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.util.LittleEndianOutputStream; +import org.broad.tribble.util.PositionalStream; +import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; + +/** + * this class writes VCF files + */ +public abstract class IndexingVCFWriter implements VCFWriter { + final private String name; + private final SAMSequenceDictionary refDict; + + private OutputStream outputStream; + private PositionalStream positionalStream = null; + private DynamicIndexCreator indexer = null; + private LittleEndianOutputStream idxStream = null; + + @Requires({"name != null", + "! ( location == null && output == null )", + "! ( enableOnTheFlyIndexing && location == null )"}) + protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) { + outputStream = output; + this.name = name; + this.refDict = refDict; + + if ( enableOnTheFlyIndexing ) { + try { + idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location))); + //System.out.println("Creating index on the fly for " + location); + indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + indexer.initialize(location, indexer.defaultBinSize()); + positionalStream = new PositionalStream(output); + outputStream = positionalStream; + } catch ( IOException ex ) { + // No matter what we keep going, since we don't care if we can't create the index file + idxStream = null; + indexer = null; + positionalStream = null; + } + } + } + + @Ensures("result != null") + public OutputStream getOutputStream() { + return outputStream; + } + + @Ensures("result != null") + public String getStreamName() { + return name; + } + + public abstract void writeHeader(VCFHeader header); + + /** + * attempt to close the VCF file + */ + public void close() { + // try to close the index stream (keep it separate to help debugging efforts) + if ( indexer != null ) { + try { + Index index = indexer.finalizeIndex(positionalStream.getPosition()); + IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict); + index.write(idxStream); + idxStream.close(); + } catch (IOException e) { + throw new ReviewedStingException("Unable to close index for " + getStreamName(), e); + } + } + } + + /** + * add a record to the file + * + * @param vc the Variant Context object + */ + public void add(VariantContext vc) { + // if we are doing on the fly indexing, add the record ***before*** we write any bytes + if ( indexer != null ) + indexer.addFeature(vc, positionalStream.getPosition()); + } + + /** + * Returns a reasonable "name" for this writer, to display to the user if something goes wrong + * + * @param location + * @param stream + * @return + */ + protected static final String writerName(final File location, final OutputStream stream) { + return location == null ? stream.toString() : location.getAbsolutePath(); + } + + /** + * Returns a output stream writing to location, or throws a UserException if this fails + * @param location + * @return + */ + protected static OutputStream openOutputStream(final File location) { + try { + return new FileOutputStream(location); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index d3705813c..0da7a100f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.TribbleException; import org.broad.tribble.index.DynamicIndexCreator; @@ -44,46 +45,30 @@ import java.util.*; /** * this class writes VCF files */ -public class StandardVCFWriter implements VCFWriter { +public class StandardVCFWriter extends IndexingVCFWriter { + // the print stream we're writing to + final protected BufferedWriter mWriter; + + // should we write genotypes or just sites? + final protected boolean doNotWriteGenotypes; // the VCF header we're storing protected VCFHeader mHeader = null; - // the print stream we're writing to - protected BufferedWriter mWriter; - protected PositionalStream positionalStream = null; - // were filters applied? protected boolean filtersWereAppliedToContext = false; - // should we write genotypes or just sites? - protected boolean doNotWriteGenotypes = false; - - protected DynamicIndexCreator indexer = null; - protected File indexFile = null; - LittleEndianOutputStream idxStream = null; - File location = null; - /** * create a VCF writer, given a file to write to * * @param location the file location to write to */ - public StandardVCFWriter(File location) { - this(location, openOutputStream(location), true, false); + public StandardVCFWriter(final File location, final SAMSequenceDictionary refDict) { + this(location, openOutputStream(location), refDict, true, false); } - public StandardVCFWriter(File location, boolean enableOnTheFlyIndexing) { - this(location, openOutputStream(location), enableOnTheFlyIndexing, false); - } - - /** - * create a VCF writer, given a stream to write to - * - * @param output the file location to write to - */ - public StandardVCFWriter(OutputStream output) { - this(output, false); + public StandardVCFWriter(File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing) { + this(location, openOutputStream(location), refDict, enableOnTheFlyIndexing, false); } /** @@ -92,33 +77,23 @@ public class StandardVCFWriter implements VCFWriter { * @param output the file location to write to * @param doNotWriteGenotypes do not write genotypes */ - public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) { - mWriter = new BufferedWriter(new OutputStreamWriter(output)); + public StandardVCFWriter(final OutputStream output, final SAMSequenceDictionary refDict, final boolean doNotWriteGenotypes) { + this(null, output, refDict, false, doNotWriteGenotypes); + } + + public StandardVCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); + mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size this.doNotWriteGenotypes = doNotWriteGenotypes; } - public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { - this.location = location; - - if ( enableOnTheFlyIndexing ) { - indexFile = Tribble.indexFile(location); - try { - idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - //System.out.println("Creating index on the fly for " + location); - indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - indexer.initialize(location, indexer.defaultBinSize()); - positionalStream = new PositionalStream(output); - output = positionalStream; - } catch ( IOException ex ) { - // No matter what we keep going, since we don't care if we can't create the index file - } - } - - //mWriter = new BufferedWriter(new OutputStreamWriter(new PositionalStream(output))); - mWriter = new BufferedWriter(new OutputStreamWriter(output)); - this.doNotWriteGenotypes = doNotWriteGenotypes; - } + // -------------------------------------------------------------------------------- + // + // VCFWriter interface functions + // + // -------------------------------------------------------------------------------- + @Override public void writeHeader(VCFHeader header) { mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header; @@ -158,44 +133,24 @@ public class StandardVCFWriter implements VCFWriter { mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new TribbleException("IOException writing the VCF header to " + locationString(), e); + throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e); } } - private String locationString() { - return location == null ? mWriter.toString() : location.getAbsolutePath(); - } - /** * attempt to close the VCF file */ + @Override public void close() { // try to close the vcf stream try { mWriter.flush(); mWriter.close(); } catch (IOException e) { - throw new TribbleException("Unable to close " + locationString() + " because of " + e.getMessage()); + throw new ReviewedStingException("Unable to close " + getStreamName(), e); } - // try to close the index stream (keep it separate to help debugging efforts) - if ( indexer != null ) { - try { - Index index = indexer.finalizeIndex(positionalStream.getPosition()); - index.write(idxStream); - idxStream.close(); - } catch (IOException e) { - throw new TribbleException("Unable to close index for " + locationString() + " because of " + e.getMessage()); - } - } - } - - protected static OutputStream openOutputStream(File location) { - try { - return new FileOutputStream(location); - } catch (FileNotFoundException e) { - throw new TribbleException("Unable to create VCF file at location: " + location); - } + super.close(); } /** @@ -203,28 +158,17 @@ public class StandardVCFWriter implements VCFWriter { * * @param vc the Variant Context object */ + @Override public void add(VariantContext vc) { - add(vc, false); - } - - /** - * add a record to the file - * - * @param vc the Variant Context object - * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD) - */ - public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) { if ( mHeader == null ) - throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString()); + throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName()); if ( doNotWriteGenotypes ) vc = VariantContext.modifyGenotypes(vc, null); try { - vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles); - - // if we are doing on the fly indexing, add the record ***before*** we write any bytes - if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition()); + vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false); + super.add(vc); Map alleleMap = new HashMap(vc.getAlleles().size()); alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup @@ -275,7 +219,7 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write(VCFConstants.FIELD_SEPARATOR); // FILTER - String filters = vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (filtersWereAppliedToContext || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); + String filters = getFilterString(vc, filtersWereAppliedToContext); mWriter.write(filters); mWriter.write(VCFConstants.FIELD_SEPARATOR); @@ -317,9 +261,22 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write("\n"); mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to " + locationString()); + throw new RuntimeException("Unable to write the VCF object to " + getStreamName()); } + } + // -------------------------------------------------------------------------------- + // + // implementation functions + // + // -------------------------------------------------------------------------------- + + public static final String getFilterString(final VariantContext vc) { + return getFilterString(vc, false); + } + + public static final String getFilterString(final VariantContext vc, boolean forcePASS) { + return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); } private String getQualValue(double qual) { @@ -462,7 +419,7 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write(encoding); } - private static String formatVCFField(Object val) { + public static String formatVCFField(Object val) { String result; if ( val == null ) result = VCFConstants.MISSING_VALUE_v4; @@ -524,12 +481,11 @@ public class StandardVCFWriter implements VCFWriter { } - public static int countOccurrences(char c, String s) { + private static int countOccurrences(char c, String s) { int count = 0; for (int i = 0; i < s.length(); i++) { count += s.charAt(i) == c ? 1 : 0; } return count; } - } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index fa030ef5f..42ea05355 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -105,34 +105,37 @@ public class VCFCodec extends AbstractVCFCodec { * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) */ protected Set parseFilters(String filterString) { + return parseFilters(filterHash, lineNo, filterString); + } + public static Set parseFilters(final Map> cache, final int lineNo, final String filterString) { // null for unfiltered if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; - // empty set for passes filters - LinkedHashSet fFields = new LinkedHashSet(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return fFields; + return Collections.emptySet(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4"); + generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status"); + generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); // do we have the filter string cached? - if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); + if ( cache != null && cache.containsKey(filterString) ) + return Collections.unmodifiableSet(cache.get(filterString)); + // empty set for passes filters + LinkedHashSet fFields = new LinkedHashSet(); // otherwise we have to parse and cache the value if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) fFields.add(filterString); else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - filterHash.put(filterString, fFields); + fFields = fFields; + if ( cache != null ) cache.put(filterString, fFields); - return fFields; + return Collections.unmodifiableSet(fFields); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index eb01e5dca..fd1c74993 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -24,6 +24,7 @@ public class VCFHeader { private final Set mMetaData; private final Map mInfoMetaData = new HashMap(); private final Map mFormatMetaData = new HashMap(); + private final Map mOtherMetaData = new HashMap(); // the list of auxillary tags private final Set mGenotypeSampleNames = new LinkedHashSet(); @@ -110,6 +111,9 @@ public class VCFHeader { VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; mFormatMetaData.put(formatLine.getName(), formatLine); } + else { + mOtherMetaData.put(line.getKey(), line); + } } } @@ -185,6 +189,14 @@ public class VCFHeader { public VCFFormatHeaderLine getFormatHeaderLine(String key) { return mFormatMetaData.get(key); } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFHeaderLine getOtherHeaderLine(String key) { + return mOtherMetaData.get(key); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java new file mode 100644 index 000000000..ef0d9ca42 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; +import java.util.*; + +/** + * GATK binary VCF record + * + * @author Your Name + * @since Date created + */ +public class GCF { + private final static int RECORD_TERMINATOR = 123456789; + private int chromOffset; + private int start, stop; + private String id; + private List alleleMap; + private int alleleOffsets[]; + private float qual; + private byte refPad; + private String info; + private int filterOffset; + + private List genotypes = Collections.emptyList(); + + public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) { + chromOffset = GCFHeaderBuilder.encodeString(vc.getChr()); + start = vc.getStart(); + stop = vc.getEnd(); + refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0; + id = vc.getID(); + + // encode alleles + alleleMap = new ArrayList(vc.getNAlleles()); + alleleOffsets = new int[vc.getNAlleles()]; + alleleMap.add(vc.getReference()); + alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference()); + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + alleleMap.add(vc.getAlternateAllele(i)); + alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); + } + + qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual()); + info = infoFieldString(vc, GCFHeaderBuilder); + filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); + + if ( ! skipGenotypes ) { + genotypes = encodeGenotypes(GCFHeaderBuilder, vc); + } + } + + public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException { + chromOffset = inputStream.readInt(); + + // have we reached the footer? + if ( chromOffset == GCFHeader.FOOTER_START_MARKER ) + throw new EOFException(); + + start = inputStream.readInt(); + stop = inputStream.readInt(); + id = inputStream.readUTF(); + refPad = inputStream.readByte(); + alleleOffsets = readIntArray(inputStream); + qual = inputStream.readFloat(); + info = inputStream.readUTF(); + filterOffset = inputStream.readInt(); + + int nGenotypes = inputStream.readInt(); + int sizeOfGenotypes = inputStream.readInt(); + if ( skipGenotypes ) { + genotypes = Collections.emptyList(); + inputStream.skipBytes(sizeOfGenotypes); + } else { + genotypes = new ArrayList(nGenotypes); + for ( int i = 0; i < nGenotypes; i++ ) + genotypes.add(new GCFGenotype(this, inputStream)); + } + + int recordDone = inputStream.readInt(); + if ( recordDone != RECORD_TERMINATOR ) + throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key"); + } + + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeInt(chromOffset); + outputStream.writeInt(start); + outputStream.writeInt(stop); + outputStream.writeUTF(id); + outputStream.writeByte(refPad); + writeIntArray(alleleOffsets, outputStream, true); + outputStream.writeFloat(qual); + outputStream.writeUTF(info); + outputStream.writeInt(filterOffset); + + int nGenotypes = genotypes.size(); + int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes; + outputStream.writeInt(nGenotypes); + outputStream.writeInt(expectedSizeOfGenotypes); + int obsSizeOfGenotypes = 0; + for ( GCFGenotype g : genotypes ) + obsSizeOfGenotypes += g.write(outputStream); + if ( obsSizeOfGenotypes != expectedSizeOfGenotypes ) + throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes); + + outputStream.writeInt(RECORD_TERMINATOR); + return outputStream.size() - startSize; + } + + public VariantContext decode(final String source, final GCFHeader header) { + final String contig = header.getString(chromOffset); + alleleMap = header.getAlleles(alleleOffsets); + double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual); + Set filters = header.getFilters(filterOffset); + Map attributes = new HashMap(); + attributes.put("INFO", info); + Byte refPadByte = refPad == 0 ? null : refPad; + Map genotypes = decodeGenotypes(header); + + return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte); + } + + private Map decodeGenotypes(final GCFHeader header) { + if ( genotypes.isEmpty() ) + return VariantContext.NO_GENOTYPES; + else { + Map map = new TreeMap(); + + for ( int i = 0; i < genotypes.size(); i++ ) { + final String sampleName = header.getSample(i); + final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap); + map.put(sampleName, g); + } + + return map; + } + } + + private List encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) { + int nGenotypes = vc.getNSamples(); + if ( nGenotypes > 0 ) { + List genotypes = new ArrayList(nGenotypes); + for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null); + + for ( Genotype g : vc.getGenotypes().values() ) { + int i = GCFHeaderBuilder.encodeSample(g.getSampleName()); + genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g)); + } + + return genotypes; + } else { + return Collections.emptyList(); + } + } + + public int getNAlleles() { return alleleOffsets.length; } + + + private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) { + StringBuilder s = new StringBuilder(); + + boolean first = true; + for ( Map.Entry field : vc.getAttributes().entrySet() ) { + String key = field.getKey(); + if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) + continue; + int stringIndex = GCFHeaderBuilder.encodeString(key); + String outputValue = StandardVCFWriter.formatVCFField(field.getValue()); + if ( outputValue != null ) { + if ( ! first ) s.append(";"); + s.append(stringIndex).append("=").append(outputValue); + first = false; + } + } + + return s.toString(); + } + + protected final static int BUFFER_SIZE = 1048576; // 2**20 + + public static DataInputStream createDataInputStream(final InputStream stream) { + return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE)); + } + + public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException { + return new FileInputStream(file); + } + + protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException { + return readIntArray(inputStream, inputStream.readInt()); + } + + protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException { + int[] array = new int[size]; + for ( int i = 0; i < array.length; i++ ) + array[i] = inputStream.readInt(); + return array; + } + + protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException { + if ( writeSize ) outputStream.writeInt(array.length); + for ( int i : array ) + outputStream.writeInt(i); + } + + protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException { + return readByteArray(inputStream, inputStream.readInt()); + } + + protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException { + byte[] array = new byte[size]; + for ( int i = 0; i < array.length; i++ ) + array[i] = inputStream.readByte(); + return array; + } + + protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException { + if ( writeSize ) outputStream.writeInt(array.length); + for ( byte i : array ) + outputStream.writeByte(i); + } + + protected final static byte qualToByte(double phredScaledQual) { + return (byte)Math.round(Math.min(phredScaledQual, 255)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java new file mode 100644 index 000000000..dd1fb091c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.*; + +/** + * GATK binary VCF record + * + * @author Your Name + * @since Date created + */ +public class GCFGenotype { + private byte gq; + private int gt; + private int dp; + private int ad[]; + private byte[] pl; + + // todo -- what to do about phasing? Perhaps we shouldn't support it + // todo -- is the FL field generic or just a flag? Should we even support per sample filtering? + + public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List allAlleles, Genotype genotype) { + gq = GCF.qualToByte(genotype.getPhredScaledQual()); + gt = encodeAlleles(genotype.getAlleles(), allAlleles); + + dp = genotype.getAttributeAsInt("DP", 0); + + int nAlleles = allAlleles.size(); + ad = new int[nAlleles]; + + int npls = nAllelesToNPls(nAlleles); + pl = new byte[npls]; + } + + private int nAllelesToNPls( int nAlleles ) { + return nAlleles*(nAlleles+1) / 2; + } + + public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException { + int gqInt = inputStream.readUnsignedByte(); + gq = (byte)gqInt; + gt = inputStream.readInt(); + dp = inputStream.readInt(); + ad = GCF.readIntArray(inputStream, GCF.getNAlleles()); + pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles())); + } + + // 2 alleles => 1 + 8 + 8 + 3 => 20 + protected int sizeInBytes() { + return 1 // gq + + 4 * 2 // gt + dp + + 4 * ad.length // ad + + 1 * pl.length; // pl + } + + public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List alleleIndex) { + final List alleles = decodeAlleles(gt, alleleIndex); + final double negLog10PError = gq / 10.0; + final Set filters = Collections.emptySet(); + final Map attributes = new HashMap(); + attributes.put("DP", dp); + attributes.put("AD", ad); + attributes.put("PL", pl); + + return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false); + } + + private static int encodeAlleles(List gtList, List allAlleles) { + final int nAlleles = gtList.size(); + if ( nAlleles > 4 ) + throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList); + + int gtInt = 0; + for ( int i = 0; i < nAlleles ; i++ ) { + final int bitOffset = i * 8; + final int allelei = getAlleleIndex(gtList.get(i), allAlleles); + final int gti = (allelei + 1) << bitOffset; + gtInt = gtInt | gti; + } + + return gtInt; + } + + private static int getAlleleIndex(Allele q, List allAlleles) { + if ( q.isNoCall() ) + return 254; + for ( int i = 0; i < allAlleles.size(); i++ ) + if ( q.equals(allAlleles.get(i)) ) + return i; + throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles); + } + + private static List decodeAlleles(int gtInt, List alleleIndex) { + List alleles = new ArrayList(4); + + for ( int i = 0; i < 32; i += 8 ) { + final int gi = (gtInt & (0x000000FF << i)) >> i; + if ( gi != 0 ) { + final int allelei = gi - 1; + alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) ); + } else { + break; + } + } + + return alleles; + } + + public int write(DataOutputStream outputStream) throws IOException { + int startSize = outputStream.size(); + outputStream.writeByte(gq); + outputStream.writeInt(gt); + outputStream.writeInt(dp); + GCF.writeIntArray(ad, outputStream, false); + GCF.writeByteArray(pl, outputStream, false); + return outputStream.size() - startSize; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java new file mode 100644 index 000000000..6d96eda56 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.io.*; +import java.util.*; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public class GCFHeader { + final protected static Logger logger = Logger.getLogger(GCFHeader.class); + + public final static int GCF_VERSION = 1; + public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes(); + public final static int FOOTER_START_MARKER = -1; + public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version + + final int version; + long footerPosition; + final List alleles; + final List strings; + final List samples; + final List> filters; + + public GCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + version = GCF_VERSION; + footerPosition = 0; + this.alleles = linearize(allelesIn); + this.strings = linearize(stringIn); + this.samples = linearize(samplesIn); + this.filters = null; // not used with this constructor + } + + public GCFHeader(FileInputStream fileInputStream) throws IOException { + DataInputStream inputStream = new DataInputStream(fileInputStream); + byte[] headerTest = new byte[GCF_FILE_START_MARKER.length]; + inputStream.read(headerTest); + if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) { + throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest)); + } else { + version = inputStream.readInt(); + logger.info("Read GCF version " + version); + footerPosition = inputStream.readLong(); + logger.info("Read footer position of " + footerPosition); + long lastPos = fileInputStream.getChannel().position(); + logger.info(" Last position is " + lastPos); + + // seek to the footer + fileInputStream.getChannel().position(footerPosition); + if ( inputStream.readInt() != FOOTER_START_MARKER ) + throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker"); + alleles = stringsToAlleles(readStrings(inputStream)); + strings = readStrings(inputStream); + samples = readStrings(inputStream); + logger.info(String.format("Allele map of %d elements", alleles.size())); + logger.info(String.format("String map of %d elements", strings.size())); + logger.info(String.format("Sample map of %d elements", samples.size())); + filters = initializeFilterCache(); + fileInputStream.getChannel().position(lastPos); + } + } + + public static int writeHeader(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.write(GCF_FILE_START_MARKER); + outputStream.writeInt(GCF_VERSION); + outputStream.writeLong(0); + return outputStream.size() - startBytes; + } + + public int writeFooter(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding + write(outputStream, allelesToStrings(alleles)); + write(outputStream, strings); + write(outputStream, samples); + return outputStream.size() - startBytes; + } + + private void write(DataOutputStream outputStream, List l) throws IOException { + outputStream.writeInt(l.size()); + for ( String elt : l ) outputStream.writeUTF(elt); + } + + private List allelesToStrings(List alleles) { + List strings = new ArrayList(alleles.size()); + for ( Allele allele : alleles ) strings.add(allele.toString()); + return strings; + } + + private List> initializeFilterCache() { + // required to allow offset -> set lookup + List> l = new ArrayList>(strings.size()); + for ( int i = 0; i < strings.size(); i++ ) l.add(null); + return l; + } + + private static List stringsToAlleles(final List strings) { + final List alleles = new ArrayList(strings.size()); + for ( String string : strings ) { + boolean isRef = string.endsWith("*"); + if ( isRef ) string = string.substring(0, string.length() - 1); + alleles.add(Allele.create(string, isRef)); + } + return alleles; + } + + private static List readStrings(final DataInputStream inputStream) throws IOException { + final int nStrings = inputStream.readInt(); + + final List strings = new ArrayList(nStrings); + for ( int i = 0; i < nStrings; i++ ) { + strings.add(inputStream.readUTF()); + } + + return strings; + } + + private static List linearize(final Map map) { + final ArrayList l = new ArrayList(map.size()); + for ( int i = 0; i < map.size(); i++ ) l.add(null); + for ( final Map.Entry elt : map.entrySet() ) + l.set(elt.getValue(), elt.getKey()); + return l; + } + + public String getSample(final int offset) { return samples.get(offset); } + public String getString(final int offset) { return strings.get(offset); } + public Allele getAllele(final int offset) { return alleles.get(offset); } + public List getAlleles(final int[] offsets) { + final List alleles = new ArrayList(offsets.length); + for ( int i : offsets ) alleles.add(getAllele(i)); + return alleles; + } + + public Set getFilters(final int offset) { + Set cached = filters.get(offset); + + if ( cached != null ) + return cached; + else { + final String filterString = getString(offset); + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; // UNFILTERED records are represented by null + else { + Set set = VCFCodec.parseFilters(null, -1, filterString); + filters.set(offset, set); // remember the result + return set; + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java new file mode 100644 index 000000000..40e01ec72 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.HashMap; +import java.util.Map; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public class GCFHeaderBuilder { + Map alleles = new HashMap(); + Map strings = new HashMap(); + Map samples = new HashMap(); + + public GCFHeader createHeader() { + return new GCFHeader(alleles, strings, samples); + } + + public int encodeString(final String chr) { return encode(strings, chr); } + public int encodeAllele(final Allele allele) { return encode(alleles, allele); } + public int encodeSample(final String sampleName) { return encode(samples, sampleName); } + + private int encode(Map map, T key) { + Integer v = map.get(key); + if ( v == null ) { + v = map.size(); + map.put(key, v); + } + return v; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java new file mode 100644 index 000000000..18fae18c4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; + +/** + * GCFWriter implementing the VCFWriter interface + * @author Your Name + * @since Date created + */ +public class GCFWriter extends IndexingVCFWriter { + final boolean skipGenotypes; + final FileOutputStream fileOutputStream; + final DataOutputStream dataOutputStream; + final GCFHeaderBuilder gcfHeaderBuilder; + int nbytes = 0; + VCFHeader header = null; + File location; + + // -------------------------------------------------------------------------------- + // + // Constructors + // + // -------------------------------------------------------------------------------- + + public GCFWriter(final File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, null), location, null, refDict, enableOnTheFlyIndexing); + this.location = location; + this.skipGenotypes = doNotWriteGenotypes; + + // write the output + try { + fileOutputStream = new FileOutputStream(location); + dataOutputStream = createDataOutputStream(fileOutputStream); + gcfHeaderBuilder = new GCFHeaderBuilder(); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(location, e); + } + } + + // -------------------------------------------------------------------------------- + // + // VCFWriter interface functions + // + // -------------------------------------------------------------------------------- + + @Override + public void writeHeader(VCFHeader header) { + this.header = header; + try { + nbytes += GCFHeader.writeHeader(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e); + } + } + + @Override + public void add(VariantContext vc) { + super.add(vc); + GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes); + try { + nbytes += gcf.write(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e); + } + } + + @Override + public void close() { + // todo -- write out VCF header lines + GCFHeader gcfHeader = gcfHeaderBuilder.createHeader(); + try { + long headerPosition = nbytes; + nbytes += gcfHeader.writeFooter(dataOutputStream); + dataOutputStream.close(); + //System.out.println("Writing forward reference to " + headerPosition); + + RandomAccessFile raFile = new RandomAccessFile(location, "rw"); + raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET); + raFile.writeLong(headerPosition); + raFile.close(); + } catch ( IOException e ) { + throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e); + } + + super.close(); + } + + private static final DataOutputStream createDataOutputStream(final OutputStream stream) { + return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE)); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f551e1368..41cbbe59f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -334,24 +334,44 @@ public class IntervalUtils { } /** - * Splits an interval list into multiple files. - * @param fileHeader The sam file header. + * Splits an interval list into multiple sublists. * @param locs The genome locs to split. * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @param scatterParts The output interval lists to write to. + * @return A list of lists of genome locs, split according to splits */ - public static void scatterFixedIntervals(SAMFileHeader fileHeader, List locs, List splits, List scatterParts) { - if (splits.size() != scatterParts.size()) - throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); - int fileIndex = 0; + public static List> splitIntervalsToSubLists(List locs, List splits) { int locIndex = 1; int start = 0; + List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { - IntervalList intervalList = new IntervalList(fileHeader); + List curList = new ArrayList(); for (int i = start; i < stop; i++) - intervalList.add(toInterval(locs.get(i), locIndex++)); - intervalList.write(scatterParts.get(fileIndex++)); + curList.add(locs.get(i)); start = stop; + sublists.add(curList); + } + + return sublists; + } + + + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param splits Pre-divided genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List> splits, List scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + + int fileIndex = 0; + int locIndex = 1; + for (final List split : splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (final GenomeLoc loc : split) + intervalList.add(toInterval(loc, locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); } } @@ -361,17 +381,15 @@ public class IntervalUtils { * @param numParts Number of parts to split the locs into. * @return The stop points to split the genome locs. */ - public static List splitFixedIntervals(List locs, int numParts) { + public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - long locsSize = 0; - for (GenomeLoc loc: locs) - locsSize += loc.size(); - List splitPoints = new ArrayList(); + final long locsSize = intervalSize(locs); + final List splitPoints = new ArrayList(); addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); Collections.sort(splitPoints); splitPoints.add(locs.size()); - return splitPoints; + return splitIntervalsToSubLists(locs, splitPoints); } private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { @@ -441,4 +459,11 @@ public class IntervalUtils { return merged; } } + + public static final long intervalSize(final List locs) { + long size = 0; + for ( final GenomeLoc loc : locs ) + size += loc.size(); + return size; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 62bbb0307..5d3ef3086 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -118,31 +118,40 @@ public class ReadUtils { /** * This enum represents all the different ways in which a read can overlap an interval. * - * NO_OVERLAP: + * NO_OVERLAP_CONTIG: + * read and interval are in different contigs. + * + * NO_OVERLAP_LEFT: + * the read does not overlap the interval. + * + * |----------------| (interval) + * <----------------> (read) + * + * NO_OVERLAP_RIGHT: * the read does not overlap the interval. * * |----------------| (interval) * <----------------> (read) * - * LEFT_OVERLAP: + * OVERLAP_LEFT: * the read starts before the beginning of the interval but ends inside of it * * |----------------| (interval) * <----------------> (read) * - * RIGHT_OVERLAP: + * OVERLAP_RIGHT: * the read starts inside the interval but ends outside of it * * |----------------| (interval) * <----------------> (read) * - * FULL_OVERLAP: + * OVERLAP_LEFT_AND_RIGHT: * the read starts before the interval and ends after the interval * * |-----------| (interval) * <-------------------> (read) * - * CONTAINED: + * OVERLAP_CONTAINED: * the read starts and ends inside the interval * * |----------------| (interval) @@ -658,7 +667,7 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedStart(SAMRecord read) { int start = read.getUnclippedStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { @@ -670,9 +679,13 @@ public class ReadUtils { return start; } - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedEnd(SAMRecord read) { int stop = read.getUnclippedStart(); + + if (readIsEntirelyInsertion(read)) + return stop; + int shift = 0; CigarOperator lastOperator = null; for (CigarElement cigarElement : read.getCigar().getCigarElements()) { @@ -686,6 +699,14 @@ public class ReadUtils { return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + private static boolean readIsEntirelyInsertion(SAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } + /** * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before * the alignment start of the read. diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index fdf3d97db..85d752003 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -293,17 +293,8 @@ public class Genotype { return commonInfo.getAttribute(key, defaultValue); } - public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); } public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); } public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - - public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); } - public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); } - public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); } - public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java index 3d162adb0..bf16cd1cf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; + import java.util.*; @@ -204,27 +206,40 @@ public final class InferredGeneticContext { return defaultValue; } -// public AttributedObject getAttributes(Collection keys) { -// AttributedObject selected = new AttributedObject(); -// -// for ( Object key : keys ) -// selected.putAttribute(key, this.getAttribute(key)); -// -// return selected; -// } + public String getAttributeAsString(String key, String defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof String ) return (String)x; + return String.valueOf(x); // throws an exception if this isn't a string + } - public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" - public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } - public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } - public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } + public int getAttributeAsInt(String key, int defaultValue) { + Object x = getAttribute(key); + if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue; + if ( x instanceof Integer ) return (Integer)x; + return Integer.valueOf((String)x); // throws an exception if this isn't a string + } - public String getAttributeAsString(String key, String defaultValue) { return (String)getAttribute(key, defaultValue); } - public int getAttributeAsInt(String key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); } - public double getAttributeAsDouble(String key, double defaultValue) { return (Double)getAttribute(key, defaultValue); } - public boolean getAttributeAsBoolean(String key, boolean defaultValue){ return (Boolean)getAttribute(key, defaultValue); } + public double getAttributeAsDouble(String key, double defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Double ) return (Double)x; + return Double.valueOf((String)x); // throws an exception if this isn't a string + } - public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } - public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } - public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } - public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } + public boolean getAttributeAsBoolean(String key, boolean defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Boolean ) return (Boolean)x; + return Boolean.valueOf((String)x); // throws an exception if this isn't a string + } + +// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" +// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } +// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } +// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } +// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } +// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } +// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } +// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 673fe4529..c1623d2cb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -666,21 +666,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati return commonInfo.getAttribute(key, defaultValue); } - public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); } public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); } public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); } - public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); } - public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); } - public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); } - - // --------------------------------------------------------------------------------------------------------- // // Working with alleles @@ -817,6 +807,28 @@ public class VariantContext implements Feature { // to enable tribble intergrati throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this); } + /** + * @param other VariantContext whose alternate alleles to compare against + * @return true if this VariantContext has the same alternate alleles as other, + * regardless of ordering. Otherwise returns false. + */ + public boolean hasSameAlternateAllelesAs ( VariantContext other ) { + Set thisAlternateAlleles = getAlternateAlleles(); + Set otherAlternateAlleles = other.getAlternateAlleles(); + + if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) { + return false; + } + + for ( Allele allele : thisAlternateAlleles ) { + if ( ! otherAlternateAlleles.contains(allele) ) { + return false; + } + } + + return true; + } + // --------------------------------------------------------------------------------------------------------- // // Working with genotypes @@ -983,7 +995,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return true if it's monomorphic */ public boolean isMonomorphic() { - return ! isVariant() || getChromosomeCount(getReference()) == getChromosomeCount(); + return ! isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples()); } /** @@ -1085,14 +1097,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati } public void validateReferenceBases(Allele reference, Byte paddedRefBase) { - // don't validate if we're an insertion or complex event - if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) { - throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); + // don't validate if we're a complex event + if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) { + throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); } // we also need to validate the padding base for simple indels - if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) - throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), (char)getReferenceBaseForIndel().byteValue(), (char)paddedRefBase.byteValue())); + if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) { + throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), (char)paddedRefBase.byteValue(), (char)getReferenceBaseForIndel().byteValue())); + } } public void validateRSIDs(Set rsIDs) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 986d6305c..343146d6d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -565,11 +565,11 @@ public class VariantContextUtils { // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += Integer.valueOf(vc.getAttributeAsString(VCFConstants.DEPTH_KEY)); + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if (rsID == null && vc.hasID()) rsID = vc.getID(); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY); + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); @@ -663,6 +663,18 @@ public class VariantContextUtils { return merged; } + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { + // if all alleles of vc1 are a contained in alleles of vc2, return true + if (!vc1.getReference().equals(vc2.getReference())) + return false; + + for (Allele a :vc1.getAlternateAlleles()) { + if (!vc2.getAlternateAlleles().contains(a)) + return false; + } + + return true; + } public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; @@ -1147,9 +1159,7 @@ public class VariantContextUtils { for (String orAttrib : MERGE_OR_ATTRIBS) { boolean attribVal = false; for (VariantContext vc : vcList) { - Boolean val = vc.getAttributeAsBooleanNoException(orAttrib); - if (val != null) - attribVal = (attribVal || val); + attribVal = vc.getAttributeAsBoolean(orAttrib, false); if (attribVal) // already true, so no reason to continue: break; } @@ -1159,7 +1169,7 @@ public class VariantContextUtils { // Merge ID fields: String iDVal = null; for (VariantContext vc : vcList) { - String val = vc.getAttributeAsStringNoException(VariantContext.ID_KEY); + String val = vc.getAttributeAsString(VariantContext.ID_KEY, null); if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { if (iDVal == null) iDVal = val; @@ -1239,8 +1249,10 @@ public class VariantContextUtils { public PhaseAndQuality(Genotype gt) { this.isPhased = gt.isPhased(); - if (this.isPhased) - this.PQ = gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + if (this.isPhased) { + this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + if ( this.PQ == -1 ) this.PQ = null; + } } } diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 386c17659..a1817e3c7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -75,7 +75,7 @@ public class WalkerTest extends BaseTest { Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec()); Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - if ( ! indexFromOutputFile.equalsIgnoreTimestamp(dynamicIndex) ) { + if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) { Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", indexFromOutputFile.getProperties(), dynamicIndex.getProperties())); diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java index ae218e898..724c343e4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -29,7 +29,6 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -45,7 +44,6 @@ import org.testng.annotations.Test; import java.io.*; import java.nio.channels.FileChannel; -import java.util.Map; /** @@ -164,7 +162,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { try { Index idx = builder.loadIndex(vcfFile, new VCFCodec()); // catch any exception; this call should pass correctly - SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx); + SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx); } catch (IOException e) { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index c0d32a05b..7f4d96add 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -127,6 +127,7 @@ public class TraverseReadsUnitTest extends BaseTest { Object accumulator = countReadWalker.reduceInit(); while (shardStrategy.hasNext()) { + traversalEngine.startTimersIfNecessary(); Shard shard = shardStrategy.next(); if (shard == null) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java new file mode 100644 index 000000000..462abeba1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.testng.Assert; +import org.testng.annotations.Test; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.SnpEffEffect; + +public class SnpEffUnitTest { + + @Test + public void testParseWellFormedEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); + } + + @Test + public void testParseInvalidEffectNameEffect() { + String effectName = "MADE_UP_EFFECT"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseInvalidEffectImpactEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseWrongNumberOfMetadataFieldsEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseSnpEffWarningEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + } + + @Test + public void testParseSnpEffErrorEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 832079807..08baae7a7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -129,12 +130,24 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testSnpEffAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -NO_HEADER -o %s -A SnpEff --variant " + - validationDataLocation + "1000G.exomes.vcf --snpEffFile " + validationDataLocation + - "snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out -L 1:26,000,000-26,500,000", + "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000", 1, - Arrays.asList("03eae1dab19a9358250890594bf53607") + Arrays.asList("486fc6a5ca1819f5ab180d5d72b1ebc9") ); executeTest("Testing SnpEff annotations", spec); } + + @Test + public void testSnpEffAnnotationsUnsupportedVersion() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff.AFR.unfiltered.unsupported.version.vcf -L 1:1-1,500,000", + 1, + UserException.class + ); + executeTest("Testing SnpEff annotations (unsupported version)", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 5f759fdbf..1a01ef8e8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s -NO_HEADER", 1, Arrays.asList("3531451e84208264104040993889aaf4")); + "-o %s -NO_HEADER", 1, Arrays.asList("b445d280fd8fee1eeb4aacb3f5a54847")); executeTest("test BeagleOutputToVCF", spec); } @@ -72,7 +72,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("8dd6ec53994fb46c5c22af8535d22965")); + "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("51a57ea565176edd96d907906914b0ee")); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index da0c8f81f..41496bdf1 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -18,6 +18,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- // @@ -28,28 +29,10 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("149e6ad9b3fd23551254a691286a96b3")); + Arrays.asList("e6639ea2dc81635c706e6c35921406d7")); executeTest("test MultiSample Pilot1", spec); } - // @Test - // todo - currently not working because when calling indels, using GENOTYPE_GIVEN_ALLELES yields a different result than in normal mode. To be fixed when extended events are removed. - public void testMultiSamplePilot2AndRecallingWithAlleles() { - String md5 = "b45636b29891f9df573ad2af6f507ee0"; - - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList(md5)); - List result = executeTest("test MultiSample Pilot2", spec1).getFirst(); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList(md5)); - executeTest("test MultiSample Pilot2 with alleles passed in", spec2); - } - @Test public void testWithAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( @@ -67,7 +50,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("82d469145c174486ccc494884852cc58")); + Arrays.asList("d1cbd1fb9f3f7323941a95bc2def7e5a")); executeTest("test SingleSample Pilot2", spec); } @@ -77,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "a5a9f38c645d6004d4640765a8b77ce4"; + private final static String COMPRESSED_OUTPUT_MD5 = "2732b169cdccb21eb3ea00429619de79"; @Test public void testCompressedOutput() { @@ -87,15 +70,6 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test compressed output", spec); } - // todo -- fixme -// @Test -// public void testCompressedOutputParallel() { -// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( -// baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000 -nt 4", 1, -// Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); -// executeTest("testCompressedOutput-nt4", spec); -// } - // -------------------------------------------------------------------------------------------------------------- // // testing parallelization @@ -107,7 +81,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "0a45761c0e557d9c2080eb9e7f4f6c41"; + String md5 = "cbac3960bbcb9d6192c57549208c182c"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -186,8 +160,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "af5199fbc0853cf5888acdcc88f012bc" ); - e.put( 1.0 / 1850, "4e6938645ccde1fdf204ffbf4e88170f" ); + e.put( 0.01, "aed69402ddffe7f2ed5ca98563bfba02" ); + e.put( 1.0 / 1850, "fa94a059f08c1821b721335d93ed2ea5" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -211,7 +185,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("213ebaaaacf850312d885e918eb33500")); + Arrays.asList("1c080e6596d4c830bb5d147b04e2a82c")); executeTest(String.format("test multiple technologies"), spec); } @@ -230,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("3aecba34a89f3525afa57a38dc20e6cd")); + Arrays.asList("9129ad748ca3be2d3b321d2d7e83ae5b")); executeTest(String.format("test calling with BAQ"), spec); } @@ -249,7 +223,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("043973c719a85de29a35a33a674616fb")); + Arrays.asList("0bece77ce6bc447438ef9b2921b2dc41")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -264,7 +238,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("68d4e6c1849e892467aed61c33e7bf24")); + Arrays.asList("5fe98ee853586dc9db58f0bc97daea63")); executeTest(String.format("test indel caller in SLX witn low min allele count"), spec); } @@ -277,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("f86d453c5d2d2f33fb28ae2050658a5e")); + Arrays.asList("790b1a1d6ab79eee8c24812bb8ca6fae")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -296,7 +270,21 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, Arrays.asList("94977d6e42e764280e9deaf4e3ac8c80")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); + + WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, + Arrays.asList("e66b7321e2ac91742ad3ef91040daafd")); + executeTest("test MultiSample Pilot2 indels with complicated records", spec3); + + WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, + Arrays.asList("4be308fd9e8167ebee677f62a7a753b7")); + executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3503a2353..b90e6d0ff 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.Arrays; public class VariantEvalIntegrationTest extends WalkerTest { - private static String variantEvalTestDataRoot = validationDataLocation + "/VariantEval"; + private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf"; @@ -14,6 +14,47 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; + @Test + public void testFunctionClassWithSnpeff() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-noEV", + "-EV TiTvVariantEvaluator", + "-noST", + "-ST FunctionalClass", + "-BTI eval", + "-o %s" + ), + 1, + Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12") + ); + executeTest("testFunctionClassWithSnpeff", spec); + } + + @Test + public void testStratifySamplesAndExcludeMonomorphicSites() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf", + "-noEV", + "-EV TiTvVariantEvaluator", + "-ST Sample", + "-BTI eval", + "-o %s" + ), + 1, + Arrays.asList("6a71b17c19f5914c277a99f45f5d9c39") + ); + executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); + } + @Test public void testFundamentalsCountVariantsSNPsAndIndels() { WalkerTestSpec spec = new WalkerTestSpec( @@ -236,7 +277,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("2df4f8911ffc3c8d042298723ed465f8")); + 1, Arrays.asList("f70997b6a3e7fdc89d11e1d61a2463d4")); executeTestParallel("testSelect1", spec); } @@ -253,7 +294,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("ed54aa127b173d8ad8b6482f2a929a42")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("407682de41dcf139ea635e9cda21b912")); executeTestParallel("testCompVsEvalAC",spec); } @@ -261,17 +302,17 @@ public class VariantEvalIntegrationTest extends WalkerTest { return String.format("%s -select '%s' -selectName %s", cmd, select, name); } - @Test + @Test(enabled = false) // no longer supported in the GATK public void testTranches() { String extraArgs = "-T VariantEval -R "+ hg18Reference +" --eval " + validationDataLocation + "GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.vcf -o %s -EV TiTvVariantEvaluator -L chr1 -noEV -ST CpG -tf " + testDir + "tranches.6.txt"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("984df6e94a546294fc7e0846cbac2dfe")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("6af2b9959aa1778a5b712536de453952")); executeTestParallel("testTranches",spec); } @Test public void testCompOverlap() { String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("462d4784dd55294ef9d5118217b157a5")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("009ecc8376a20dce81ff5299ef6bfecb")); executeTestParallel("testCompOverlap",spec); } @@ -283,7 +324,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("18c44636e36d6657110bf984f8eac181")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("424c9d438b1faa59b2c29413ba32f37b")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -295,7 +336,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("1b8ae4fd10de0888bd843f833859d990")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("18fa0b89ebfff51141975d7e4ce7a159")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -312,13 +353,13 @@ public class VariantEvalIntegrationTest extends WalkerTest { " -noST -noEV -ST Novelty -EV CompOverlap" + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("a3c2177849cb00fdff99574cff7f0e4f")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("0b81d97f843ec4a1a4222d1f9949bfca")); executeTestParallel("testMultipleCompTracks",spec); } @Test public void testPerSampleAndSubsettedSampleHaveSameResults() { - String md5 = "dab415cc76846e18fcf8c78f2b2ee033"; + String md5 = "b0565ac61b2860248e4abd478a177b5e"; WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index a5b0412e8..c81891ac6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -41,11 +41,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b37KGReference + - " -known:prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -training:prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -truth:prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -training:prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + - " -truth:prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + " -T VariantRecalibrator" + " -input " + params.inVCF + " -L 20:1,000,000-40,000,000" + @@ -73,5 +71,52 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { Arrays.asList(params.cutVCFMD5)); executeTest("testApplyRecalibration-"+params.inVCF, spec); } + + VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", + "6d7ee4cb651c8b666e4a4523363caaff", // tranches + "4759b111a5aa53975d46e0f22c7983bf", // recal file + "5d7e07d8813db96ba3f3dfe4737f83d1"); // cut VCF + + @DataProvider(name = "VRIndelTest") + public Object[][] createData2() { + return new Object[][]{ {indel} }; + } + + @Test(dataProvider = "VRIndelTest") + public void testVariantRecalibratorIndel(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:1,000,000-40,000,000" + + " -an QD -an ReadPosRankSum -an HaplotypeScore" + + " -percentBad 0.08" + + " -mode INDEL -mG 3" + + " --minNumBadVariants 0" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRIndelTest",dependsOnMethods="testVariantRecalibratorIndel") + public void testApplyRecalibrationIndel(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " -mode INDEL" + + " -NO_HEADER" + + " -input " + params.inVCF + + " -o %s" + + " -tranchesFile " + MD5DB.getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + MD5DB.getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 3267173a7..35495d797 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -90,7 +90,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f"); } // official project VCF files in tabix format @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9"); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a"); } + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "312a22aedb088b678bc891f1a1b03c91"); } @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083"); } @@ -110,7 +110,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("1de95f91ca15d2a8856de35dee0ce33e")); + Arrays.asList("35acb0f15f9cd18c653ede4e15e365c9")); executeTest("threeWayWithRefs", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 3801e132d..00044f859 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("ea09bf764adba9765b99921c5ba2c709") + Arrays.asList("d46a735ffa898f4aa6b3758c5b03f06d") ); executeTest("testVCFStreamingChain", selectTestSpec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java index adf3b21a8..5f71f82fd 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -113,4 +113,16 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { executeTest("test bad alt allele", spec); } + + @Test + public void testBadAllele2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad3.vcf", "REF"), + 0, + UserException.MalformedFile.class + ); + + executeTest("test bad ref allele in deletion", spec); + } + } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java deleted file mode 100644 index 6d492565b..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodecUnitTest.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.snpEff; - -import org.apache.commons.io.input.ReaderInputStream; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.LineReader; -import org.testng.Assert; -import org.testng.annotations.Test; - -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.EffectType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.ChangeType; -import static org.broadinstitute.sting.utils.codecs.snpEff.SnpEffConstants.Zygosity; - -import java.io.StringReader; - -public class SnpEffCodecUnitTest { - - @Test - public void testParseWellFormedSnpEffHeaderLine() { - String wellFormedSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around\tCustom_interval_ID"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wellFormedSnpEffHeaderLine))); - String headerReturned = (String)codec.readHeader(reader); - - Assert.assertEquals(headerReturned, wellFormedSnpEffHeaderLine); - } - - @Test(expectedExceptions = TribbleException.InvalidHeader.class) - public void testParseWrongNumberOfFieldsSnpEffHeaderLine() { - String wrongNumberOfFieldsSnpEffHeaderLine = "# Chromo\tPosition\tReference\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(wrongNumberOfFieldsSnpEffHeaderLine))); - codec.readHeader(reader); - } - - @Test(expectedExceptions = TribbleException.InvalidHeader.class) - public void testParseMisnamedColumnSnpEffHeaderLine() { - String misnamedColumnSnpEffHeaderLine = "# Chromo\tPosition\tRef\tChange\tChange type\t" + - "Homozygous\tQuality\tCoverage\tWarnings\tGene_ID\tGene_name\tBio_type\tTrancript_ID\tExon_ID\t" + - "Exon_Rank\tEffect\told_AA/new_AA\tOld_codon/New_codon\tCodon_Num(CDS)\tCDS_size\tCodons around\t" + - "AAs around\tCustom_interval_ID"; - - SnpEffCodec codec = new SnpEffCodec(); - LineReader reader = new AsciiLineReader(new ReaderInputStream(new StringReader(misnamedColumnSnpEffHeaderLine))); - codec.readHeader(reader); - } - - @Test - public void testParseSimpleEffectSnpEffLine() { - String simpleEffectSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 69428l, - "T", - "G", - ChangeType.SNP, - Zygosity.Hom, - 6049.69, - 61573l, - null, - "ENSG00000177693", - "OR4F5", - "mRNA", - "ENST00000326183", - "exon_1_69055_70108", - 1, - false, - EffectType.NON_SYNONYMOUS_CODING, - null, - "F/C", - "TTT/TGT", - 113, - 918, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(simpleEffectSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseNonCodingRegionSnpEffLine() { - String nonCodingRegionSnpEffLine = "1\t1337592\tG\tC\tSNP\tHom\t1935.52\t21885\t\tENSG00000250188\t" + - "RP4-758J18.5\tmRNA\tENST00000514958\texon_1_1337454_1338076\t2\tWITHIN_NON_CODING_GENE, NON_SYNONYMOUS_CODING\t" + - "L/V\tCTA/GTA\t272\t952\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 1337592l, - "G", - "C", - ChangeType.SNP, - Zygosity.Hom, - 1935.52, - 21885l, - null, - "ENSG00000250188", - "RP4-758J18.5", - "mRNA", - "ENST00000514958", - "exon_1_1337454_1338076", - 2, - true, - EffectType.NON_SYNONYMOUS_CODING, - null, - "L/V", - "CTA/GTA", - 272, - 952, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(nonCodingRegionSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseExtraEffectInformationSnpEffLine() { - String extraEffectInformationSnpEffLine = "1\t879537\tT\tC\tSNP\tHom\t341.58\t13733\t\tENSG00000187634\tSAMD11\t" + - "mRNA\tENST00000341065\t\t\tUTR_3_PRIME: 4 bases from transcript end\t\t\t\t\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 879537l, - "T", - "C", - ChangeType.SNP, - Zygosity.Hom, - 341.58, - 13733l, - null, - "ENSG00000187634", - "SAMD11", - "mRNA", - "ENST00000341065", - null, - null, - false, - EffectType.UTR_3_PRIME, - "4 bases from transcript end", - null, - null, - null, - null, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(extraEffectInformationSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test - public void testParseMultiEffectSnpEffLine() { - String multiEffectSnpEffLine = "1\t901901\tC\tT\tSNP\tHom\t162.91\t4646\t\tENSG00000187583\tPLEKHN1\tmRNA\t" + - "ENST00000379410\texon_1_901877_901994\t1\tSTART_GAINED: ATG, UTR_5_PRIME: 11 bases from TSS\t\t\t\t\t\t\t"; - - SnpEffFeature expectedFeature = new SnpEffFeature("1", - 901901l, - "C", - "T", - ChangeType.SNP, - Zygosity.Hom, - 162.91, - 4646l, - null, - "ENSG00000187583", - "PLEKHN1", - "mRNA", - "ENST00000379410", - "exon_1_901877_901994", - 1, - false, - EffectType.START_GAINED, - "ATG, UTR_5_PRIME: 11 bases from TSS", - null, - null, - null, - null, - null, - null, - null - ); - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(multiEffectSnpEffLine); - - Assert.assertEquals(feature, expectedFeature); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseWrongNumberOfFieldsSnpEffLine() { - String wrongNumberOfFieldsSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\t918\t\t"; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(wrongNumberOfFieldsSnpEffLine); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseBlankEffectFieldSnpEffLine() { - String blankEffectFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\t\tF/C\tTTT/TGT\t113\t918\t\t\t"; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(blankEffectFieldSnpEffLine); - } - - @Test(expectedExceptions = TribbleException.InvalidDecodeLine.class) - public void testParseInvalidNumericFieldSnpEffLine() { - String invalidNumericFieldSnpEffLine = "1\t69428\tT\tG\tSNP\tHom\t6049.69\t61573\t\tENSG00000177693\t" + - "OR4F5\tmRNA\tENST00000326183\texon_1_69055_70108\t1\tNON_SYNONYMOUS_CODING\tF/C\tTTT/TGT\t113\tfoo\t\t\t";; - - SnpEffCodec codec = new SnpEffCodec(); - SnpEffFeature feature = (SnpEffFeature)codec.decode(invalidNumericFieldSnpEffLine); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index d08cda949..55bd4783b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -1,53 +1,43 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.*; import org.broad.tribble.iterators.CloseableTribbleIterator; import org.broad.tribble.source.BasicFeatureSource; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.testng.Assert; +import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; /** * tests out the various functions in the index factory class */ -public class IndexFactoryUnitTest { +public class IndexFactoryUnitTest extends BaseTest { File inputFile = new File("public/testdata/HiSeq.10000.vcf"); File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf"); File outputFileIndex = Tribble.indexFile(outputFile); - /** - * test out scoring the indexes - */ - @Test - public void testScoreIndexes() { - /*// make a list of indexes to score - Map creators = new HashMap(); - // add a linear index with the default bin size - LinearIndexCreator linearNormal = new LinearIndexCreator(); - linearNormal.initialize(inputFile, linearNormal.defaultBinSize()); - creators.add(LInearIndexlinearNormal); + private SAMSequenceDictionary dict; - // create a tree index with a small index size - IntervalIndexCreator treeSmallBin = new IntervalIndexCreator(); - treeSmallBin.initialize(inputFile, Math.max(200,treeSmallBin.defaultBinSize()/10)); - creators.add(treeSmallBin); - - List indexes = new ArrayList(); - for (IndexCreator creator : creators) - indexes.add(creator.finalizeIndex(0)); - - ArrayList scores = IndexFactory.scoreIndexes(0.5,indexes,100, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - System.err.println("scores are : "); - for (Double score : scores) { - System.err.println(score); -*/ + @BeforeTest + public void setup() { + try { + dict = new CachingIndexedFastaSequenceFile(new File(b37KGReference)).getSequenceDictionary(); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(b37KGReference,ex); + } } // @@ -65,7 +55,7 @@ public class IndexFactoryUnitTest { BasicFeatureSource source = new BasicFeatureSource(inputFile.getAbsolutePath(), indexFromInputFile, new VCFCodec()); int counter = 0; - VCFWriter writer = new StandardVCFWriter(outputFile); + VCFWriter writer = new StandardVCFWriter(outputFile, dict); writer.writeHeader((VCFHeader)source.getHeader()); CloseableTribbleIterator it = source.iterator(); while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index e3a926fb9..a8e6593b1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -38,12 +38,13 @@ public class VCFWriterUnitTest extends BaseTest { private Set additionalColumns = new HashSet(); private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); private GenomeLocParser genomeLocParser; + private IndexedFastaSequenceFile seq; @BeforeClass public void beforeTests() { File referenceFile = new File(hg18Reference); try { - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); + seq = new CachingIndexedFastaSequenceFile(referenceFile); genomeLocParser = new GenomeLocParser(seq); } catch(FileNotFoundException ex) { @@ -55,7 +56,7 @@ public class VCFWriterUnitTest extends BaseTest { @Test public void testBasicWriteAndRead() { VCFHeader header = createFakeHeader(metaData,additionalColumns); - VCFWriter writer = new StandardVCFWriter(fakeVCFFile); + VCFWriter writer = new StandardVCFWriter(fakeVCFFile, seq.getSequenceDictionary()); writer.writeHeader(header); writer.add(createVC(header)); writer.add(createVC(header)); diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bb892eec8..98b878d23 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -30,6 +30,20 @@ public class IntervalUtilsUnitTest extends BaseTest { private SAMFileHeader hg19Header; private GenomeLocParser hg19GenomeLocParser; private List hg19ReferenceLocs; + private List hg19exomeIntervals; + + private List getLocs(String... intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return hg18ReferenceLocs; + List locs = new ArrayList(); + for (String interval: intervals) + locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); + return locs; + } @BeforeClass public void init() { @@ -54,12 +68,69 @@ public class IntervalUtilsUnitTest extends BaseTest { ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref); hg19GenomeLocParser = new GenomeLocParser(seq); hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; + + hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals), false)); } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(hg19Ref,ex); } } + // ------------------------------------------------------------------------------------- + // + // tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- + + private class IntervalSlicingTest extends TestDataProvider { + public int parts; + public double maxAllowableVariance; + + private IntervalSlicingTest(final int parts, final double maxAllowableVariance) { + super(IntervalSlicingTest.class); + this.parts = parts; + this.maxAllowableVariance = maxAllowableVariance; + } + + public String toString() { + return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance); + } + } + + @DataProvider(name = "intervalslicingdata") + public Object[][] createTrees() { + new IntervalSlicingTest(1, 0); + new IntervalSlicingTest(2, 1); + new IntervalSlicingTest(5, 1); + new IntervalSlicingTest(10, 1); + new IntervalSlicingTest(67, 1); + new IntervalSlicingTest(100, 1); + new IntervalSlicingTest(500, 1); + new IntervalSlicingTest(1000, 1); + return IntervalSlicingTest.getTests(IntervalSlicingTest.class); + } + + @Test(enabled = true, dataProvider = "intervalslicingdata") + public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); + + long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); + long idealSplitSize = totalSize / test.parts; + + long sumOfSplitSizes = 0; + int counter = 0; + for ( final List split : splits ) { + long splitSize = IntervalUtils.intervalSize(split); + double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); + //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + counter++; + sumOfSplitSizes += splitSize; + Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); + } + + Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); + } + @Test(expectedExceptions=UserException.class) public void testMergeListsBySetOperatorNoOverlap() { // a couple of lists we'll use for the testing @@ -129,19 +200,6 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals((long)lengths.get("chrX"), 154913754); } - private List getLocs(String... intervals) { - return getLocs(Arrays.asList(intervals)); - } - - private List getLocs(List intervals) { - if (intervals.size() == 0) - return hg18ReferenceLocs; - List locs = new ArrayList(); - for (String interval: intervals) - locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return locs; - } - @Test public void testParseIntervalArguments() { Assert.assertEquals(getLocs().size(), 45); @@ -174,8 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("basic.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -200,8 +258,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("less.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -228,8 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest { public void testScatterFixedIntervalsMoreFiles() { List files = testFiles("more.", 3, ".intervals"); List locs = getLocs("chr1", "chr2"); - List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); } @Test public void testScatterFixedIntervalsStart() { @@ -242,8 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -270,8 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -298,8 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -319,7 +377,7 @@ public class IntervalUtilsUnitTest extends BaseTest { public void testScatterFixedIntervalsFile() { List files = testFiles("sg.", 20, ".intervals"); List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); int[] counts = { 125, 138, 287, 291, 312, 105, 155, 324, @@ -332,16 +390,13 @@ public class IntervalUtilsUnitTest extends BaseTest { }; //String splitCounts = ""; - for (int lastIndex = 0, i = 0; i < splits.size(); i++) { - int splitIndex = splits.get(i); - int splitCount = (splitIndex - lastIndex); - //splitCounts += ", " + splitCount; - lastIndex = splitIndex; + for (int i = 0; i < splits.size(); i++) { + int splitCount = splits.get(i).size(); Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); } //System.out.println(splitCounts.substring(2)); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); int locIndex = 0; for (int i = 0; i < files.size(); i++) { @@ -357,8 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsMax() { List files = testFiles("sg.", 85, ".intervals"); - List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); - IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); + IntervalUtils.scatterFixedIntervals(hg19Header, splits, files); for (int i = 0; i < files.size(); i++) { String file = files.get(i).toString(); diff --git a/public/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml index 7e31934df..a6675a63d 100644 --- a/public/packages/AnalyzeCovariates.xml +++ b/public/packages/AnalyzeCovariates.xml @@ -6,10 +6,7 @@ - - - - + diff --git a/public/packages/FindContaminatingReadGroups.xml b/public/packages/FindContaminatingReadGroups.xml deleted file mode 100644 index 880f64a81..000000000 --- a/public/packages/FindContaminatingReadGroups.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - - - diff --git a/public/packages/GATKResources.xml b/public/packages/GATKResources.xml deleted file mode 100755 index 87e6e0e50..000000000 --- a/public/packages/GATKResources.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/IndelGenotyper.xml b/public/packages/IndelGenotyper.xml deleted file mode 100644 index c9e3ae0f6..000000000 --- a/public/packages/IndelGenotyper.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - - diff --git a/public/packages/LocalRealignmentAroundIndels.xml b/public/packages/LocalRealignmentAroundIndels.xml deleted file mode 100644 index 46960e69f..000000000 --- a/public/packages/LocalRealignmentAroundIndels.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff --git a/public/packages/QualityScoresRecalibration.xml b/public/packages/QualityScoresRecalibration.xml deleted file mode 100644 index 95e8b7c63..000000000 --- a/public/packages/QualityScoresRecalibration.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - - diff --git a/public/packages/RMDIndexer.xml b/public/packages/RMDIndexer.xml deleted file mode 100644 index 5d40876de..000000000 --- a/public/packages/RMDIndexer.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - diff --git a/public/packages/UnifiedGenotyper.xml b/public/packages/UnifiedGenotyper.xml deleted file mode 100644 index 67a17640c..000000000 --- a/public/packages/UnifiedGenotyper.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - - diff --git a/public/packages/VariantAnnotator.xml b/public/packages/VariantAnnotator.xml deleted file mode 100644 index 88c0701f0..000000000 --- a/public/packages/VariantAnnotator.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/VariantEval.xml b/public/packages/VariantEval.xml deleted file mode 100644 index 791066fb7..000000000 --- a/public/packages/VariantEval.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - - diff --git a/public/packages/VariantFiltration.xml b/public/packages/VariantFiltration.xml deleted file mode 100644 index 48fa0ff37..000000000 --- a/public/packages/VariantFiltration.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - diff --git a/public/packages/VariantRecalibration.xml b/public/packages/VariantRecalibration.xml deleted file mode 100644 index 6fe6b1eff..000000000 --- a/public/packages/VariantRecalibration.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl index 21cb8bb6b..ba4198292 100755 --- a/public/perl/liftOverVCF.pl +++ b/public/perl/liftOverVCF.pl @@ -36,7 +36,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; if ($recordOriginalLocation) { $cmd .= " -recordOriginalLocation"; } @@ -66,7 +66,7 @@ system($cmd) == 0 or quit("The sorting step failed. Please correct the necessar # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out"; system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying."); # clean up diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 2a135496d..f97ce4884 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -37,11 +37,6 @@ class DataProcessingPipeline extends QScript { * Optional Parameters ****************************************************************************/ - -// @Input(doc="path to Picard's SortSam.jar (if re-aligning a previously processed BAM file)", fullName="path_to_sort_jar", shortName="sort", required=false) -// var sortSamJar: File = _ -// - @Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false) var indels: List[File] = List() @@ -132,24 +127,6 @@ class DataProcessingPipeline extends QScript { } } return sampleTable.toMap - -// println("\n\n*** INPUT FILES ***\n") -// // Creating one file for each sample in the dataset -// val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] -// for ((sample, flist) <- sampleTable) { -// -// println(sample + ":") -// for (f <- flist) -// println (f) -// println() -// -// val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") -// sampleBamFiles(sample) = sampleFileName -// //add(writeList(flist, sampleFileName)) -// } -// println("*** INPUT FILES ***\n\n") -// -// return sampleBamFiles.toMap } // Rebuilds the Read Group string to give BWA diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 59c00b8cd..036a77b58 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -131,11 +131,11 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf", "hapmap_3.3", b37, true, true)) - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf", - "1000G_indels_for_realignment", b37, true, false)) + addResource(new Resource("/humgen/1kg/processing/official_release/phase1/ALL.wgs.VQSR_consensus_biallelic.20101123.indels.sites.vcf", + "1000G_biallelic.indels", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf", - "indels_mills_devine", b37, true, true)) + "Mills_Devine_2hit.indels", b37, true, true)) // // example call set for wiki tutorial @@ -300,9 +300,9 @@ class GATKResourcesBundle extends QScript { bamFile = bamIn } - class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRod with UNIVERSAL_GATK_ARGS { + class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRODs with UNIVERSAL_GATK_ARGS { //@Output val vcfIndex: File = swapExt(vcf.getParent, vcf, ".vcf", ".vcf.idx") - this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) + this.rod :+= vcf this.reference_sequence = ref } @@ -313,7 +313,7 @@ class GATKResourcesBundle extends QScript { } class MakeDBSNP129(@Input dbsnp: File, @Input ref: File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("variant", "VCF", dbsnp) + this.variant = dbsnp this.select ++= List("\"dbSNPBuildID <= 129\"") this.reference_sequence = ref this.out = dbsnp129 diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index 637d3edca..29ee2769b 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -22,8 +22,8 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false) var noBAQ: Boolean = false - @Argument(shortName="indels", doc="calls indels with the Unified Genotyper", required=false) - var callIndels: Boolean = false + @Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false) + var noIndels: Boolean = false @Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false) var LOCAL_ET: Boolean = false @@ -177,7 +177,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { val goldStandard = true for (target <- targets) { if( !skipCalling ) { - if (callIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target)) + if (!noIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target)) add(new snpCall(target)) add(new VQSR(target, !goldStandard)) add(new applyVQSR(target, !goldStandard)) @@ -260,12 +260,10 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) - this.training :+= new TaggedFile( t.hapmapFile, "prior=15.0") - this.truth :+= new TaggedFile( t.hapmapFile, "prior=15.0") - this.training :+= new TaggedFile( omni_b37, "prior=12.0") - this.truth :+= new TaggedFile( omni_b37, "prior=12.0") - this.training :+= new TaggedFile( training_1000G, "prior=10.0" ) - this.known :+= new TaggedFile( t.dbsnpFile, "prior=2.0" ) + this.resource :+= new TaggedFile( t.hapmapFile, "training=true,truth=true,prior=15.0" ) + this.resource :+= new TaggedFile( omni_b37, "training=true,truth=true,prior=12.0" ) + this.resource :+= new TaggedFile( training_1000G, "training=true,prior=10.0" ) + this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" ) this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" ) this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS") if(t.nSamples >= 10) { diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala index 1d473b210..9bddfd97c 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala @@ -56,15 +56,15 @@ class ExampleUnifiedGenotyper extends QScript { genotyper.input_file :+= qscript.bamFile genotyper.out = swapExt(qscript.bamFile, "bam", "unfiltered.vcf") - evalUnfiltered.rodBind :+= RodBind("eval", "VCF", genotyper.out) + evalUnfiltered.eval :+= genotyper.out evalUnfiltered.out = swapExt(genotyper.out, "vcf", "eval") - variantFilter.rodBind :+= RodBind("variant", "VCF", genotyper.out) + variantFilter.variant = genotyper.out variantFilter.out = swapExt(qscript.bamFile, "bam", "filtered.vcf") variantFilter.filterName = filterNames variantFilter.filterExpression = filterExpressions.map("\"" + _ + "\"") - evalFiltered.rodBind :+= RodBind("eval", "VCF", variantFilter.out) + evalFiltered.eval :+= variantFilter.out evalFiltered.out = swapExt(variantFilter.out, "vcf", "eval") add(genotyper, evalUnfiltered) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala index ef7f2afb0..4cb925d9f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala @@ -156,7 +156,7 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod val maxLines = 100 val tailLines = IOUtils.tail(errorFile, maxLines) val nl = "%n".format() - val summary = if (tailLines.size <= maxLines) "Last %d lines".format(maxLines) else "Contents" + val summary = if (tailLines.size > maxLines) "Last %d lines".format(maxLines) else "Contents" logger.error("%s of %s:%n%s".format(summary, errorFile, tailLines.mkString(nl))) } else { logger.error("Unable to access log file: %s".format(errorFile)) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index aae5e438c..0fb997f43 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -1,65 +1,65 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.queue.extensions.gatk - -import java.io.File -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.IntervalUtils -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource -import net.sf.samtools.SAMFileHeader -import java.util.Collections -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} - -case class GATKIntervals(reference: File, intervals: List[String]) { - private lazy val referenceDataSource = new ReferenceDataSource(reference) - private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] - - lazy val samFileHeader = { - val header = new SAMFileHeader - header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) - header - } - - lazy val locs: java.util.List[GenomeLoc] = { - val parser = new GenomeLocParser(referenceDataSource.getReference) - val parsedLocs = - if (intervals.isEmpty) - GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList - else - IntervalUtils.parseIntervalArguments(parser, intervals, false) - Collections.sort(parsedLocs) - Collections.unmodifiableList(parsedLocs) - } - - lazy val contigs = locs.map(_.getContig).distinct.toList - - def getSplits(size: Int) = { - splitsBySize.getOrElse(size, { - val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) - splitsBySize += size -> splits - splits - }) - } -} +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +import java.io.File +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource +import net.sf.samtools.SAMFileHeader +import java.util.Collections +import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} + +case class GATKIntervals(reference: File, intervals: List[String]) { + private lazy val referenceDataSource = new ReferenceDataSource(reference) +// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] + + lazy val samFileHeader = { + val header = new SAMFileHeader + header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) + header + } + + lazy val locs: java.util.List[GenomeLoc] = { + val parser = new GenomeLocParser(referenceDataSource.getReference) + val parsedLocs = + if (intervals.isEmpty) + GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList + else + IntervalUtils.parseIntervalArguments(parser, intervals, false) + Collections.sort(parsedLocs) + Collections.unmodifiableList(parsedLocs) + } + + lazy val contigs = locs.map(_.getContig).distinct.toList + +// def getSplits(size: Int) = { +// splitsBySize.getOrElse(size, { +// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) +// splitsBySize += size -> splits +// splits +// }) +// } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index d88d272b9..f65d5ab29 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction def run() { val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) - IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs, - gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles) + val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala index 42f63e225..b4c5d91d3 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala @@ -1,7 +1,7 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File -import org.broadinstitute.sting.queue.function.FileExtension +import org.broadinstitute.sting.queue.util.FileExtension import java.lang.String /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala index ed8158b49..b19f9e430 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala @@ -1,7 +1,7 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File -import org.broadinstitute.sting.queue.function.FileExtension +import org.broadinstitute.sting.queue.util.FileExtension /** * Used to provide tagged -I input_file arguments to the GATK. diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index c905581fa..500f7b200 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -387,25 +387,11 @@ trait QFunction extends Logging with QJobReport { */ protected def canon(value: Any) = { value match { - case fileExtension: FileExtension => - val newFile = absolute(fileExtension); - val newFileExtension = fileExtension.withPath(newFile.getPath) - newFileExtension - case file: File => - if (file.getClass != classOf[File]) - throw new QException("Extensions of file must also extend with FileExtension so that the path can be modified."); - absolute(file) + case file: File => IOUtils.absolute(commandDirectory, file) case x => x } } - /** - * Returns the absolute path to the file relative to the run directory and the job command directory. - * @param file File to root relative to the command directory if it is not already absolute. - * @return The absolute path to file. - */ - private def absolute(file: File) = IOUtils.absolute(commandDirectory, file) - /** * Scala sugar type for checking annotation required and exclusiveOf. */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala b/public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala similarity index 89% rename from public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala rename to public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala index e2394a5bf..9b6e52c8e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/FileExtension.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/FileExtension.scala @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.queue.function +package org.broadinstitute.sting.queue.util import java.io.File diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala index 79ffa8cb9..b17ccc0d5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/IOUtils.scala @@ -3,6 +3,7 @@ package org.broadinstitute.sting.queue.util import org.apache.commons.io.FileUtils import java.io.{FileReader, File} import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.queue.QException /** * A collection of utilities for modifying java.io. @@ -12,7 +13,7 @@ object IOUtils extends Logging { * Checks if the temp directory has been setup and throws an exception if they user hasn't set it correctly. * @param tempDir Temporary directory. */ - def checkTempDir(tempDir: File) = { + def checkTempDir(tempDir: File) { val tempDirPath = tempDir.getAbsolutePath // Keeps the user from leaving the temp directory as the default, and on Macs from having pluses // in the path which can cause problems with the Google Reflections library. @@ -20,7 +21,7 @@ object IOUtils extends Logging { if (tempDirPath.startsWith("/var/folders/") || (tempDirPath == "/tmp") || (tempDirPath == "/tmp/")) throw new UserException.BadTmpDir("java.io.tmpdir must be explicitly set") if (!tempDir.exists && !tempDir.mkdirs) - throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath()) + throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath) } /** @@ -35,9 +36,9 @@ object IOUtils extends Logging { throw new UserException.BadTmpDir("Could not create temp directory: " + tempDirParent) val temp = File.createTempFile(prefix + "-", suffix, tempDirParent) if (!temp.delete) - throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath()) + throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath) if (!temp.mkdir) - throw new UserException.BadTmpDir("Could not create sub directory: " + temp.getAbsolutePath()) + throw new UserException.BadTmpDir("Could not create sub directory: " + temp.getAbsolutePath) absolute(temp) } @@ -46,7 +47,7 @@ object IOUtils extends Logging { * @param file File to write to. * @param content Content to write. */ - def writeContents(file: File, content: String) = FileUtils.writeStringToFile(file, content) + def writeContents(file: File, content: String) { FileUtils.writeStringToFile(file, content) } /** * Reads content of a file into a string. @@ -146,10 +147,12 @@ object IOUtils extends Logging { * @return The absolute path to the file in the parent dir if the path was not absolute, otherwise the original path. */ def absolute(parent: File, file: File): File = { - if (file.isAbsolute) - absolute(file) - else - absolute(new File(parent, file.getPath)) + val newPath = + if (file.isAbsolute) + absolutePath(file) + else + absolutePath(new File(parent, file.getPath)) + replacePath(file, newPath) } /** @@ -159,12 +162,16 @@ object IOUtils extends Logging { * @return the absolute path to the file. */ def absolute(file: File) = { + replacePath(file, absolutePath(file)) + } + + private def absolutePath(file: File) = { var fileAbs = file.getAbsoluteFile var names = List.empty[String] while (fileAbs != null) { val name = fileAbs.getName fileAbs = fileAbs.getParentFile - + if (name == ".") { /* skip */ @@ -190,7 +197,18 @@ object IOUtils extends Logging { } } - new File(names.mkString("/", "/", "")) + names.mkString("/", "/", "") + } + + private def replacePath(file: File, path: String) = { + file match { + case fileExtension: FileExtension => + fileExtension.withPath(path) + case file: File => + if (file.getClass != classOf[File]) + throw new QException("Sub classes of java.io.File must also implement FileExtension so that the path can be modified.") + new File(path) + } } /** diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index b3a2d23ae..38abe24ef 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -53,8 +53,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3)) Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3")) - Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) - Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) +// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) +// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) } @Test(timeOut = 30000) @@ -65,7 +65,7 @@ class GATKIntervalsUnitTest { // for(Item item: javaConvertedScalaList) // This for loop is actually an O(N^2) operation as the iterator calls the // O(N) javaConvertedScalaList.size() for each iteration of the loop. - Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) + //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) Assert.assertEquals(gi.contigs.size, 24) } @@ -74,8 +74,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, Nil) Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) - Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) - Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) +// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) +// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) } @Test diff --git a/settings/repository/org.broad/tribble-24.jar b/settings/repository/org.broad/tribble-25.jar similarity index 89% rename from settings/repository/org.broad/tribble-24.jar rename to settings/repository/org.broad/tribble-25.jar index b1c39e60a..6d467ba3f 100644 Binary files a/settings/repository/org.broad/tribble-24.jar and b/settings/repository/org.broad/tribble-25.jar differ diff --git a/settings/repository/org.broad/tribble-24.xml b/settings/repository/org.broad/tribble-25.xml similarity index 51% rename from settings/repository/org.broad/tribble-24.xml rename to settings/repository/org.broad/tribble-25.xml index 9b2b967f8..ed7a1fd69 100644 --- a/settings/repository/org.broad/tribble-24.xml +++ b/settings/repository/org.broad/tribble-25.xml @@ -1,3 +1,3 @@ - +