From 5f84a4ad82ee7ae445e3c0a8ba13d83256e16e7e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 24 Dec 2012 16:31:47 -0500 Subject: [PATCH 06/22] Clover report excludes test files and other non-interesting files from the clover reports --- build.xml | 56 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/build.xml b/build.xml index 47e4eeb47..b4bb8716d 100644 --- a/build.xml +++ b/build.xml @@ -111,7 +111,7 @@ - + @@ -273,19 +273,19 @@ - - - - - - + + - - - - - - + + @@ -1140,7 +1140,34 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1148,6 +1175,7 @@ + From 38cc496de8f0b6aac37bd649a4b1fe3eade7a59d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 29 Dec 2012 14:34:08 -0500 Subject: [PATCH 07/22] Move SomaticIndelDetector and associated tools and libraries into private/andrey package -- Intermediate commit on the way to archiving SomaticIndelDetector and other tools. -- SomaticIndelDetector, PairMaker and RemapAlignments tools have been refactored into the private andrey package. All utility classes refactored into here as well. At this point, the SomaticIndelDetector builds in this version of the GATK. -- Subsequent commit will put this code into the archive so it no longer builds in the GATK --- .../walkers/indels/SomaticIndelDetector.java | 2291 ----------------- .../utils/collections/CircularArray.java | 231 -- .../interval/OverlappingIntervalIterator.java | 195 -- .../sting/utils/sam/AlignmentUtils.java | 53 - 4 files changed, 2770 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java delete mode 100755 public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java deleted file mode 100644 index 68be1629c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java +++ /dev/null @@ -1,2291 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.*; -import org.apache.commons.jexl2.Expression; -import org.apache.commons.jexl2.JexlContext; -import org.apache.commons.jexl2.JexlEngine; -import org.apache.commons.jexl2.MapContext; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; -import org.broadinstitute.sting.gatk.filters.Platform454Filter; -import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; -import org.broadinstitute.sting.utils.codecs.refseq.Transcript; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.collections.CircularArray; -import org.broadinstitute.sting.utils.collections.PrimitivePair; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; - -import java.io.*; -import java.util.*; - - -/** - * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well, - * but this latter functionality is now superceded by UnifiedGenotyper. - * - *

- * This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing - * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs - * include additional statistics such as mismatches and base qualitites around the calls, read strandness (how many - * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional - * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will - * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional - * metrics for the post-processing tools to make the final decision). The calls are performed by default - * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I - * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments, - * respectively. Indels are called from the tumor sample and annotated as germline - * if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic - * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling - * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold - * in tumor sample, it will not be reported. - * - * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input - * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged - * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups). - * - * Which (putative) calls will make it into the output file(s) is controlled by an expression/list of expressions passed with -filter - * flag: if any of the expressions evaluate to TRUE, the site will be discarded. Otherwise the putative call and all the - * associated statistics will be printed into the output. Expressions recognize the following variables(in paired-sample - * somatic mode variables are prefixed with T_ and N_ for Tumor and Normal, e.g. N_COV and T_COV are defined instead of COV): - * COV for coverage at the site, INDEL_F for fraction of reads supporting consensus indel at the site (wrt total coverage), - * INDEL_CF for fraction of reads with consensus indel wrt all reads with an indel at the site, CONS_CNT for the count of - * reads supporting the consensus indel at the site. Conventional arithmetic and logical operations are supported. For instance, - * N_COV<4||T_COV<6||T_INDEL_F<0.3||T_INDEL_CF<0.7 instructs the tool to only output indel calls with at least 30% observed - * allelic fraction and with consensus indel making at least 70% of all indel observations at the site, and only at the sites - * where tumor coverage and normal coverage are at least 6 and 4, respectively. - *

Input

- *

- * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode). - *

- * - *

Output

- *

- * Indel calls with associated metrics. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T SomaticIndelDetector \
- *   -o indels.vcf \
- *   -verbose indels.txt
- *   -I:normal normal.bam \
- *   -I:tumor tumor.bam
- * 
- * - */ - -@DocumentedGATKFeature( groupName = "Cancer-specific Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) -@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class}) -public class SomaticIndelDetector extends ReadWalker { -// @Output -// PrintStream out; - @Output(doc="File to write variants (indels) in VCF format",required=true) - protected VariantContextWriter vcf_writer = null; - - @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true) - @Deprecated - java.io.File output_file; - - @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print callability metrics output", required = false) - public PrintStream metricsWriter = null; - -// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false) -// boolean FORMAT_VCF = false; - - @Hidden - @Input(fullName = "genotype_intervals", shortName = "genotype", - doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false) - public IntervalBinding genotypeIntervalsFile = null; - - @Hidden - @Argument(fullName="unpaired", shortName="unpaired", - doc="Perform unpaired calls (no somatic status detection)", required=false) - boolean call_unpaired = false; - boolean call_somatic ; - - @Argument(fullName="verboseOutput", shortName="verbose", - doc="Verbose output file in text format", required=false) - java.io.File verboseOutput = null; - - @Argument(fullName="bedOutput", shortName="bed", - doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) - java.io.File bedOutput = null; - - @Deprecated - @Argument(fullName="minCoverage", shortName="minCoverage", - doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+ - "with --unpaired (single sample) option, this value is used for minimum sample coverage. "+ - "INSTEAD USE: T_COV FILTER_EXPRESSIONS = new ArrayList(); - -//@Argument(fullName="blacklistedLanes", shortName="BL", -// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ -// "by this application, so they will not contribute indels to consider and will not be counted.", required=false) -//PlatformUnitFilterHelper dummy; - - @Hidden - @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on", - required=false) Boolean DEBUG = false; - @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ - "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+ - "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+ - "than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+ - "if window is too small",required=false) int WINDOW_SIZE = 200; - @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ - " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; - - - - private WindowContext tumor_context; - private WindowContext normal_context; - private int currentContigIndex = -1; - private int contigLength = -1; // we see to much messy data with reads hanging out of contig ends... - private int currentPosition = -1; // position of the last read we've seen on the current contig - private String refName = null; - private java.io.Writer output = null; - private GenomeLoc location = null; - private long normalCallsMade = 0L, tumorCallsMade = 0L; - - boolean outOfContigUserWarned = false; - - private LocationAwareSeekableRODIterator refseqIterator=null; - -// private Set normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able -// private Set tumorReadGroups ; // to properly assign the reads coming from a merged stream - private Set normalSamples; // we are going to remember which samples are normal and which are tumor: - private Set tumorSamples ; // these are used only to generate genotypes for vcf output - - private int NQS_WIDTH = 5; // 5 bases on each side of the indel for NQS-style statistics - - private Writer bedWriter = null; - private Writer verboseWriter = null; - - - private static String annGenomic = "GENOMIC\t"; - private static String annIntron = "INTRON"; - private static String annUTR = "UTR"; - private static String annCoding = "CODING"; - private static String annUnknown = "UNKNOWN"; - - enum CallType { - NOCOVERAGE, - BADCOVERAGE, - NOEVIDENCE, - GERMLINE, - SOMATIC - }; - - private SAMRecord lastRead; - private byte[] refBases; - private ReferenceDataSource refData; - private Iterator genotypeIntervalIterator = null; - - // the current interval in the list of intervals, for which we want to do full genotyping - private GenomeLoc currentGenotypeInterval = null; - private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed; - // can be 1 base before lastGenotyped start - - private JexlEngine jexlEngine = new JexlEngine(); - private ArrayList jexlExpressions = new ArrayList(); - - // the following arrays store indel source-specific (normal/tumor) metric names - // for fast access when populating JEXL expression contexts (see IndelPrecall.fillContext()) - private final static String[] normalMetricsCassette = new String[4]; - private final static String[] tumorMetricsCassette = new String[4]; - private final static String[] singleMetricsCassette = new String[4]; - private final static int C_COV=0; - private final static int C_CONS_CNT=1; - private final static int C_INDEL_F=2; - private final static int C_INDEL_CF=3; - static { - normalMetricsCassette[C_COV] = "N_COV"; - tumorMetricsCassette[C_COV] = "T_COV"; - singleMetricsCassette[C_COV] = "COV"; - normalMetricsCassette[C_CONS_CNT] = "N_CONS_CNT"; - tumorMetricsCassette[C_CONS_CNT] = "T_CONS_CNT"; - singleMetricsCassette[C_CONS_CNT] = "CONS_CNT"; - normalMetricsCassette[C_INDEL_F] = "N_INDEL_F"; - tumorMetricsCassette[C_INDEL_F] = "T_INDEL_F"; - singleMetricsCassette[C_INDEL_F] = "INDEL_F"; - normalMetricsCassette[C_INDEL_CF] = "N_INDEL_CF"; - tumorMetricsCassette[C_INDEL_CF] = "T_INDEL_CF"; - singleMetricsCassette[C_INDEL_CF] = "INDEL_CF"; - } - - // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" - - private Set getVCFHeaderInfo() { - Set headerInfo = new HashSet(); - - // first, the basic info - headerInfo.add(new VCFHeaderLine("source", "SomaticIndelDetector")); - headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - headerInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY)); - - // FORMAT and INFO fields -// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings()); - - headerInfo.addAll(VCFIndelAttributes.getAttributeHeaderLines()); - if ( call_somatic ) { - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); - } else { - } - - // all of the arguments from the argument collection - Set args = new HashSet(); - args.add(this); - args.addAll(getToolkit().getFilters()); - Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args); - for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) - headerInfo.add(new VCFHeaderLine(String.format("SID_%s", commandLineArg.getKey()), commandLineArg.getValue())); - // also, the list of input bams - for ( String fileName : getToolkit().getArguments().samFiles ) - headerInfo.add(new VCFHeaderLine("SID_bam_file_used", fileName)); - - return headerInfo; - } - - - @Override - public void initialize() { - - call_somatic = (call_unpaired ? false : true); - normal_context = new WindowContext(0,WINDOW_SIZE); - normalSamples = new HashSet(); - - if ( bedOutput != null && output_file != null ) { - throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time"); - } - - if ( RefseqFileName != null ) { - logger.info("Using RefSeq annotations from "+RefseqFileName); - - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - getToolkit().getArguments().unsafe); - RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); - - refseqIterator = new SeekableRODIterator(refseq.getHeader(), - refseq.getSequenceDictionary(), - getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - refseq.getIterator()); - } - - if ( refseqIterator == null ) logger.info("No gene annotations available"); - - int nSams = getToolkit().getArguments().samFiles.size(); - - if ( call_somatic ) { - if ( nSams < 2 ) throw new UserException.BadInput("In default (paired sample) mode at least two bam files (normal and tumor) must be specified"); - tumor_context = new WindowContext(0,WINDOW_SIZE); - tumorSamples = new HashSet(); - } - - int nNorm = 0; - int nTum = 0; - for ( SAMReaderID rid : getToolkit().getReadsDataSource().getReaderIDs() ) { - Tags tags = rid.getTags() ; - if ( tags.getPositionalTags().isEmpty() && call_somatic ) - throw new UserException.BadInput("In default (paired sample) mode all input bam files must be tagged as either 'normal' or 'tumor'. Untagged file: "+ - getToolkit().getSourceFileForReaderID(rid)); - boolean normal = false; - boolean tumor = false; - for ( String s : tags.getPositionalTags() ) { // we allow additional unrelated tags (and we do not use them), but we REQUIRE one of Tumor/Normal to be present if --somatic is on - if ( "NORMAL".equals(s.toUpperCase()) ) { - normal = true; - nNorm++; - } - if ( "TUMOR".equals(s.toUpperCase()) ) { - tumor = true; - nTum++ ; - } - } - if ( call_somatic && normal && tumor ) throw new UserException.BadInput("Input bam file "+ - getToolkit().getSourceFileForReaderID(rid)+" is tagged both as normal and as tumor. Which one is it??"); - if ( call_somatic && !normal && ! tumor ) - throw new UserException.BadInput("In somatic mode all input bams must be tagged as either normal or tumor. Encountered untagged file: "+ - getToolkit().getSourceFileForReaderID(rid)); - if ( ! call_somatic && (normal || tumor) ) - System.out.println("WARNING: input bam file "+getToolkit().getSourceFileForReaderID(rid) - +" is tagged as Normal and/or Tumor, but somatic mode is not on. Tags will ne IGNORED"); - if ( call_somatic && tumor ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { - tumorSamples.add(rg.getSample()); - } - } else { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { - normalSamples.add(rg.getSample()); - } - } - if ( genotypeIntervalsFile != null ) { - - // read in the whole list of intervals for cleaning - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), genotypeIntervalsFile.getIntervals(getToolkit()), IntervalMergingRule.OVERLAPPING_ONLY); - genotypeIntervalIterator = locs.iterator(); - - // wrap intervals requested for genotyping inside overlapping iterator, so that we actually - // genotype only on the intersections of the requested intervals with the -L intervals - genotypeIntervalIterator = new OverlappingIntervalIterator(genotypeIntervalIterator, getToolkit().getIntervals().iterator() ); - - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - - if ( DEBUG) System.out.println("DEBUG>> first genotyping interval="+currentGenotypeInterval); - - if ( currentGenotypeInterval != null ) lastGenotypedPosition = currentGenotypeInterval.getStart()-1; - } - - } - - location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); - - try { - // we already checked that bedOutput and output_file are not set simultaneously - if ( bedOutput != null ) bedWriter = new FileWriter(bedOutput); - if ( output_file != null ) bedWriter = new FileWriter(output_file); - } catch (java.io.IOException e) { - throw new UserException.CouldNotReadInputFile(bedOutput, "Failed to open BED file for writing.", e); - } - try { - if ( verboseOutput != null ) verboseWriter = new FileWriter(verboseOutput); - } catch (java.io.IOException e) { - throw new UserException.CouldNotReadInputFile(verboseOutput, "Failed to open BED file for writing.", e); - } - - vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ; - refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile); - - // Now initialize JEXL expressions: - if ( FILTER_EXPRESSIONS.size() == 0 ) { - if ( call_unpaired ) { - FILTER_EXPRESSIONS.add("COV<6||INDEL_F<0.3||INDEL_CF<0.7"); - } else { - FILTER_EXPRESSIONS.add("T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7"); - } - } - for ( String s : FILTER_EXPRESSIONS ) { - try { - Expression e = jexlEngine.createExpression(s); - jexlExpressions.add(e); - } catch (Exception e) { - throw new UserException.BadArgumentValue("Filter expression", "Invalid expression used (" + s + "). Please see the JEXL docs for correct syntax.") ; - } - - } - } - - - @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - - // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); - - if ( DEBUG ) { - // System.out.println("DEBUG>> read at "+ read.getAlignmentStart()+"-"+read.getAlignmentEnd()+ - // "("+read.getCigarString()+")"); - if ( read.getDuplicateReadFlag() ) System.out.println("DEBUG>> Duplicated read (IGNORED)"); - } - - if ( AlignmentUtils.isReadUnmapped(read) || - read.getDuplicateReadFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getMappingQuality() == 0 ) { - return 0; // we do not need those reads! - } - - if ( read.getReferenceIndex() != currentContigIndex ) { - // we just jumped onto a new contig - if ( DEBUG ) System.out.println("DEBUG>>> Moved to contig "+read.getReferenceName()); - if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName()+": contig is out of order; input BAM file is unsorted"); - - // print remaining indels from the previous contig (if any); - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); - - currentContigIndex = read.getReferenceIndex(); - currentPosition = read.getAlignmentStart(); - refName = new String(read.getReferenceName()); - - location = getToolkit().getGenomeLocParser().createGenomeLoc(refName,location.getStart(),location.getStop()); - contigLength = getToolkit().getGenomeLocParser().getContigInfo(refName).getSequenceLength(); - outOfContigUserWarned = false; - - lastGenotypedPosition = -1; - - normal_context.clear(); // reset coverage window; this will also set reference position to 0 - if ( call_somatic) tumor_context.clear(); - - refBases = new String(refData.getReference().getSequence(read.getReferenceName()).getBases()).toUpperCase().getBytes(); - } - - // we have reset the window to the new contig if it was required and emitted everything we collected - // on a previous contig. At this point we are guaranteed that we are set up properly for working - // with the contig of the current read. - - // NOTE: all the sanity checks and error messages below use normal_context only. We make sure that normal_context and - // tumor_context are synchronized exactly (windows are always shifted together by emit_somatic), so it's safe - - if ( read.getAlignmentStart() < currentPosition ) // oops, read out of order? - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName() +" out of order on the contig\n"+ - "Read starts at "+refName+":"+read.getAlignmentStart()+"; last read seen started at "+refName+":"+currentPosition - +"\nLast read was: "+lastRead.getReadName()+" RG="+lastRead.getAttribute("RG")+" at "+lastRead.getAlignmentStart()+"-" - +lastRead.getAlignmentEnd()+" cigar="+lastRead.getCigarString()); - - currentPosition = read.getAlignmentStart(); - lastRead = read; - - if ( read.getAlignmentEnd() > contigLength ) { - if ( ! outOfContigUserWarned ) { - System.out.println("WARNING: Reads aligned past contig length on "+ location.getContig()+"; all such reads will be skipped"); - outOfContigUserWarned = true; - } - return 0; - } - - long alignmentEnd = read.getAlignmentEnd(); - Cigar c = read.getCigar(); - int lastNonClippedElement = 0; // reverse offset to the last unclipped element - CigarOperator op = null; - // moving backwards from the end of the cigar, skip trailing S or H cigar elements: - do { - lastNonClippedElement++; - op = c.getCigarElement( c.numCigarElements()-lastNonClippedElement ).getOperator(); - } while ( op == CigarOperator.H || op == CigarOperator.S ); - - // now op is the last non-S/H operator in the cigar. - - // a little trick here: we want to make sure that current read completely fits into the current - // window so that we can accumulate indel observations over the whole length of the read. - // The ::getAlignmentEnd() method returns the last position on the reference where bases from the - // read actually match (M cigar elements). After our cleaning procedure, we can have reads that end - // with I element, which is not gonna be counted into alignment length on the reference. On the other hand, - // in this program we assign insertions, internally, to the first base *after* the insertion position. - // Hence, we have to make sure that that extra base is already in the window or we will get IndexOutOfBounds. - - if ( op == CigarOperator.I) alignmentEnd++; - - if ( alignmentEnd > normal_context.getStop()) { - - // we don't emit anything until we reach a read that does not fit into the current window. - // At that point we try shifting the window to the start of that read (or reasonably close) and emit everything prior to - // that position. This is legitimate, since the reads are sorted and we are not gonna see any more coverage at positions - // below the current read's start. - // Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting - // the window to around the read's start will ensure that the read fits... - - if ( DEBUG) System.out.println("DEBUG>> Window at "+normal_context.getStart()+"-"+normal_context.getStop()+", read at "+ - read.getAlignmentStart()+": trying to emit and shift" ); - if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false ); - else emit( read.getAlignmentStart(), false ); - - // let's double check now that the read fits after the shift - if ( read.getAlignmentEnd() > normal_context.getStop()) { - // ooops, looks like the read does not fit into the window even after the latter was shifted!! - // we used to die over such reads and require user to run with larger window size. Now we - // just print a warning and discard the read (this means that our counts can be slightly off in - // th epresence of such reads) - //throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+ - // "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ - // read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ - // "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); - System.out.println("WARNING: Read "+read.getReadName()+ - " is out of coverage window bounds. Probably window is too small and the window_size value must be increased.\n"+ - " The read is ignored in this run (so all the counts/statistics reported will not include it).\n"+ - " Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ - read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ - "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); - return 1; - } - } - - if ( call_somatic ) { - - Tags tags = getToolkit().getReaderIDForRead(read).getTags(); - boolean assigned = false; - for ( String s : tags.getPositionalTags() ) { - if ( "NORMAL".equals(s.toUpperCase()) ) { - normal_context.add(read,ref.getBases()); - assigned = true; - break; - } - if ( "TUMOR".equals(s.toUpperCase()) ) { - tumor_context.add(read,ref.getBases()); - assigned = true; - break; - } - } - if ( ! assigned ) - throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+ - "has no Normal/Tumor tag associated with it"); - -// String rg = (String)read.getExtendedAttribute("RG"); -// if ( rg == null ) -// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls."); - -// if ( normalReadGroups.contains(rg) ) { -// normal_context.add(read,ref.getBases()); -// } else if ( tumorReadGroups.contains(rg) ) { -// tumor_context.add(read,ref.getBases()); -// } else { -// throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg); -// } - - if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. The whole window will be dropped."); - tumor_context.shift(WINDOW_SIZE); - normal_context.shift(WINDOW_SIZE); - } - if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+" in normal sample. The whole window will be dropped"); - tumor_context.shift(WINDOW_SIZE); - normal_context.shift(WINDOW_SIZE); - } - - - } else { - normal_context.add(read, ref.getBases()); - if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+". The whole window will be dropped"); - normal_context.shift(WINDOW_SIZE); - } - } - - return 1; - } - - /** An auxiliary shortcut: returns true if position(location.getContig(), p) is past l */ - private boolean pastInterval(long p, GenomeLoc l) { - return ( location.getContigIndex() > l.getContigIndex() || - location.getContigIndex() == l.getContigIndex() && p > l.getStop() ); - } - - /** Emit calls of the specified type across genotyping intervals, from position lastGenotypedPosition+1 to - * pos-1, inclusive. - * @param contigIndex - * @param pos - * @param call - */ - /* - private void emitNoCallsUpTo(int contigIndex, long pos, CallType call) { - - if ( contigIndex < currentGenotypeInterval.getContigIndex() || - contigIndex == currentGenotypeInterval.getContigIndex() && pos <= currentGenotypeInterval.getStart() ) return; - - if ( contigIndex == currentGenotypeInterval.getContigIndex() && pos >= currentGenotypeInterval.getStart() ) { - for ( long p = lastGenotypedPosition+1; p < pos; p++ ) { - - } - } - while( currentGenotypeInterval != null ) { - - while ( ) - if ( genotypeIntervalIterator.hasNext() ) { - currentGenotypeInterval = genotypeIntervalIterator.next() ; - if ( pastInterval(p,currentGenotypeInterval) ) { - // if we are about to jump over the whole next interval, we need to emit NO_COVERAGE calls there! - emitNoCoverageCalls(currentGenotypeInterval); - } - } else { - currentGenotypeInterval = null; - } - } - } -*/ - - /** Output indel calls up to the specified position and shift the window: after this method is executed, the - * first element of the window maps onto 'position', if possible, or at worst a few bases to the left of 'position' if we may need more - * reads to get full NQS-style statistics for an indel in the close proximity of 'position'. - * - * @param position - */ - private void emit(long position, boolean force) { - - long adjustedPosition = adjustPosition(position); - - if ( adjustedPosition == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(position-normal_context.getStart())); - return; - } - long move_to = adjustedPosition; - - for ( int pos = normal_context.getStart() ; pos < Math.min(adjustedPosition,normal_context.getStop()+1) ; pos++ ) { - - boolean genotype = false; - // first let's see if we need to genotype current position: - - final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) - - if ( pos <= lastGenotypedPosition ) continue; - - while ( currentGenotypeInterval != null ) { - - // if we did not even reach next interval yet, no genotyping at current position: - if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || - location.getContigIndex() == currentGenotypeInterval.getContigIndex() && - p < currentGenotypeInterval.getStart() ) break; - if ( pastInterval(p, currentGenotypeInterval) ) { - // we are past current genotyping interval, so we are done with it; let's load next interval: - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - continue; // re-enter the loop to check against the interval we just loaded - } - - // we reach this point only if p is inside current genotyping interval; set the flag and bail out: - genotype = true; - break; - } - -// if ( DEBUG ) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); - - if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue; - - IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); - JexlContext jc = new MapContext(); - normalCall.fillContext(jc,singleMetricsCassette); - boolean discard_event = false; - - for ( Expression e : jexlExpressions ) { - if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; } - } - - if ( discard_event && ! genotype ) { - normal_context.indelsAt(pos).clear(); - continue; //* - } - -// if ( normalCall.getCoverage() < minCoverage && ! genotype ) { -// if ( DEBUG ) { -// System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); -// } -// continue; // low coverage -// } - - if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos); - - long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() ); - long right = pos+( normalCall.getVariant() == null ? 0 : normalCall.getVariant().lengthOnRef())+NQS_WIDTH-1; - - if ( right >= adjustedPosition && ! force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = adjustPosition(left); - if ( move_to == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(adjustedPosition-normal_context.getStart())); - return; - } - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); - break; - } - - // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right: - if ( right > normal_context.getStop() ) right = normal_context.getStop(); - - // location = getToolkit().getGenomeLocParser().setStart(location,pos); - // location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data - - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos); - - if ( ! discard_event ) normalCallsMade++; - printVCFLine(vcf_writer,normalCall, discard_event); - if ( bedWriter != null ) normalCall.printBedLine(bedWriter); - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, discard_event); - lastGenotypedPosition = pos; - - normal_context.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); - normal_context.shift((int)(move_to - normal_context.getStart() ) ); - } - - /** A shortcut. Returns true if we got indels within the specified interval in single and only window context - * (for single-sample calls) or in either of the two window contexts (for two-sample/somatic calls) - * - */ - private boolean indelsPresentInInterval(long start, long stop) { - if ( tumor_context == null ) return normal_context.hasIndelsInInterval(start,stop); - return tumor_context.hasIndelsInInterval(start,stop) || - normal_context.hasIndelsInInterval(start,stop); - } - /** Takes the position, to which window shift is requested, and tries to adjust it in such a way that no NQS window is broken. - * Namely, this method checks, iteratively, if there is an indel within NQS_WIDTH bases ahead of initially requested or adjusted - * shift position. If there is such an indel, - * then shifting to that position would lose some or all NQS-window bases to the left of the indel (since it's not going to be emitted - * just yet). Instead, this method tries to readjust the shift position leftwards so that full NQS window to the left of the next indel - * is preserved. This method tries thie strategy 4 times (so that it would never walk away too far to the left), and if it fails to find - * an appropriate adjusted shift position (which could happen if there are many indels following each other at short intervals), it will give up, - * go back to the original requested shift position and try finding the first shift poisition that has no indel associated with it. - */ - - private long adjustPosition(long request) { - long initial_request = request; - int attempts = 0; - boolean failure = false; - while ( indelsPresentInInterval(request,request+NQS_WIDTH) ) { - request -= NQS_WIDTH; - if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); - attempts++; - if ( attempts == 4 ) { - if ( DEBUG ) System.out.println("DEBUG>> attempts to preserve full NQS window failed; now trying to find any suitable position.") ; - failure = true; - break; - } - } - - if ( failure ) { - // we tried 4 times but did not find a good shift position that would preserve full nqs window - // around all indels. let's fall back and find any shift position as long and there's no indel at the very - // first position after the shift (this is bad for other reasons); if it breaks a nqs window, so be it - request = initial_request; - attempts = 0; - while ( indelsPresentInInterval(request,request+1) ) { - request--; - if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); - attempts++; - if ( attempts == 50 ) { - System.out.println("WARNING: Indel at every position in the interval "+refName+":"+request+"-"+initial_request+ - ". Can not find a break to shift context window to; no calls will be attempted in the current window."); - return -1; - } - } - } - if ( DEBUG ) System.out.println("DEBUG>> Found acceptable target position "+request); - return request; - } - - /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed - * first elements of the coverage arrays map onto 'position', or a few bases prior to the specified position - * if there is an indel in close proximity to 'position' so that we may get more coverage around it later. - * - * @param position - */ - private void emit_somatic(long position, boolean force) { - - long adjustedPosition = adjustPosition(position); - if ( adjustedPosition == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(position-normal_context.getStart())); - tumor_context.shift((int)(position-tumor_context.getStart())); - return; - } - long move_to = adjustedPosition; - - if ( DEBUG ) System.out.println("DEBUG>> Emitting in somatic mode up to "+position+" force shift="+force+" current window="+tumor_context.getStart()+"-"+tumor_context.getStop()); - - for ( int pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) { - - boolean genotype = false; - // first let's see if we need to genotype current position: - - final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) - - if ( pos <= lastGenotypedPosition ) continue; - - while ( currentGenotypeInterval != null ) { - - // if we did not even reach next interval yet, no genotyping at current position: - if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || - location.getContigIndex() == currentGenotypeInterval.getContigIndex() && - p < currentGenotypeInterval.getStart() ) break; - if ( pastInterval(p, currentGenotypeInterval) ) { - // we are past current genotyping interval, so we are done with it; let's load next interval: - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - continue; // re-enter the loop to check against the interval we just loaded - } - - // we reach tjis point only if p is inside current genotyping interval; set the flag and bail out: - genotype = true; - break; - } -// if ( DEBUG) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); - - if ( tumor_context.indelsAt(pos).size() == 0 && ! genotype ) continue; // no indels in tumor - - if ( DEBUG && genotype ) System.out.println("DEBUG>> Genotyping requested at "+pos); - - IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH); - IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); - - JexlContext jc = new MapContext(); - tumorCall.fillContext(jc,tumorMetricsCassette); - normalCall.fillContext(jc,normalMetricsCassette); - boolean discard_event = false; - - for ( Expression e : jexlExpressions ) { - if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; } - } - - if ( discard_event && ! genotype ) { - tumor_context.indelsAt(pos).clear(); - normal_context.indelsAt(pos).clear(); - continue; //* - } -// if ( tumorCall.getCoverage() < minCoverage && ! genotype ) { -// if ( DEBUG ) { -// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)"); -// } -// continue; // low coverage -// } -// if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) { -// if ( DEBUG ) { -// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); -// } -// continue; // low coverage -// } - - if ( DEBUG ) { - System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, "); - System.out.print("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in normal at "+pos); - } - - long left = Math.max( pos-NQS_WIDTH, tumor_context.getStart() ); - long right = pos+ ( tumorCall.getVariant() == null ? 0 : tumorCall.getVariant().lengthOnRef() )+NQS_WIDTH-1; - - if ( right >= adjustedPosition && ! force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = adjustPosition(left); - if ( move_to == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(adjustedPosition-normal_context.getStart())); - tumor_context.shift((int)(adjustedPosition-tumor_context.getStart())); - return; - } - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); - break; - } - - if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right - - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data - -// boolean haveCall = tumorCall.isCall(); // cache the value - - if ( ! discard_event ) tumorCallsMade++; - - printVCFLine(vcf_writer,normalCall,tumorCall,discard_event); - - if ( bedWriter != null ) tumorCall.printBedLine(bedWriter); - - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall, discard_event ); - lastGenotypedPosition = pos; - - tumor_context.indelsAt(pos).clear(); - normal_context.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); - tumor_context.shift((int)(move_to - tumor_context.getStart() ) ); - normal_context.shift((int)(move_to - normal_context.getStart() ) ); - } - - private String makeFullRecord(IndelPrecall normalCall, IndelPrecall tumorCall) { - StringBuilder fullRecord = new StringBuilder(); - if ( tumorCall.getVariant() != null || normalCall.getVariant() == null) { - fullRecord.append(tumorCall.makeEventString()); - } else { - fullRecord.append(normalCall.makeEventString()); - } - fullRecord.append('\t'); - fullRecord.append(normalCall.makeStatsString("N_")); - fullRecord.append('\t'); - fullRecord.append(tumorCall.makeStatsString("T_")); - fullRecord.append('\t'); - return fullRecord.toString(); - } - - private String makeFullRecord(IndelPrecall normalCall) { - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(normalCall.makeEventString()); - fullRecord.append('\t'); - fullRecord.append(normalCall.makeStatsString("")); - fullRecord.append('\t'); - return fullRecord.toString(); - } - - private String getAnnotationString(RODRecordList ann) { - if ( ann == null ) return annGenomic; - else { - StringBuilder b = new StringBuilder(); - - if ( RefSeqFeature.isExon(ann) ) { - if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence - else b.append(annUTR); // exon but not coding = UTR - } else { - if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron - else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... - } - b.append('\t'); - b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed -// while ( it.hasNext() ) { // -// t.getGeneName() -// } - return b.toString(); - } - - } - - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, boolean discard_event) { - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); - - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(makeFullRecord(normalCall)); - fullRecord.append(annotationString); - if ( discard_event && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); - try { - verboseWriter.write(fullRecord.toString()); - verboseWriter.write('\n'); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); - } - - } - - - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall, boolean discard_event) { - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); - - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(makeFullRecord(normalCall,tumorCall)); - - if ( normalCall.getVariant() == null && tumorCall.getVariant() == null ) { - // did not observe anything - if ( normalCall.getCoverage() >= minNormalCoverage && tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); - else { - if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); // no coverage in normal but nothing in tumor - else { - // no coverage in tumor; if we have no coverage in normal, it can be anything; if we do have coverage in normal, - // this still could be a somatic event. so either way it is 'unknown' - fullRecord.append("UNKNOWN"); - } - } - - } - - if ( normalCall.getVariant() == null && tumorCall.getVariant() != null ) { - // looks like somatic call - if ( normalCall.getCoverage() >= minNormalCoverage ) fullRecord.append("SOMATIC"); // we confirm there is nothing in normal - else { - // low coverage in normal - fullRecord.append("EVENT_T"); // no coverage in normal, no idea whether it is germline or somatic - } - } - - if ( normalCall.getVariant() != null && tumorCall.getVariant() == null ) { - // it's likely germline (with missing observation in tumor - maybe loh? - if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("GERMLINE_LOH"); // we confirm there is nothing in tumor - else { - // low coverage in tumor, maybe we missed the event - fullRecord.append("GERMLINE"); // no coverage in tumor but we already saw it in normal... - } - } - - if ( normalCall.getVariant() != null && tumorCall.getVariant() != null ) { - // events in both T/N, got to be germline! - fullRecord.append("GERMLINE"); - } - - - fullRecord.append('\t'); - fullRecord.append(annotationString); - - if ( discard_event && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); - - try { - verboseWriter.write(fullRecord.toString()); - verboseWriter.write('\n'); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); - } - } - - public void printVCFLine(VariantContextWriter vcf, IndelPrecall call, boolean discard_event) { - - long start = call.getPosition()-1; - // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. - // The suggestion is instead of putting the base before the indel, to put the base after the indel. - // For now, just don't print out that site. - if ( start == 0 ) - return; - - long stop = start; - - List alleles = new ArrayList(2); // actual observed (distinct!) alleles at the site - List homref_alleles = null; // when needed, will contain two identical copies of ref allele - needed to generate hom-ref genotype - - final byte referencePaddingBase = refBases[(int)start-1]; - - if ( call.getVariant() == null ) { - // we will need to create genotype with two (hom) ref alleles (below). - // we can not use 'alleles' list here, since that list is supposed to contain - // only *distinct* alleles observed at the site or VCFContext will frown upon us... - alleles.add( Allele.create(referencePaddingBase,true) ); - homref_alleles = new ArrayList(2); - homref_alleles.add( alleles.get(0)); - homref_alleles.add( alleles.get(0)); - } else { - // we always create alt allele when we observe anything but the ref, even if it is not a call! - // (Genotype will tell us whether it is an actual call or not!) - int event_length = call.getVariant().lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - fillAlleleList(alleles,call,referencePaddingBase); - stop += event_length; - } - - GenotypesContext genotypes = GenotypesContext.create(); - for ( String sample : normalSamples ) { - GenotypeBuilder gb = new GenotypeBuilder(sample); - - gb=call.addStatsAttributes(gb); - gb.alleles(! discard_event - ? alleles // we made a call - put actual het genotype here: - : homref_alleles); // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) - genotypes.add(gb.make()); - - } - Set filters = null; - if ( call.getVariant() != null && discard_event ) { - filters = new HashSet(); - filters.add("NoCall"); - } - VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) - .genotypes(genotypes).filters(filters).make(); - vcf.add(vc); - } - - /** Fills l with appropriate alleles depending on whether call is insertion or deletion - * (l MUST have a variant or this method will crash). It is guaranteed that the *first* allele added - * to the list is ref, and the next one is alt. - * @param l - * @param call - */ - private void fillAlleleList(List l, IndelPrecall call, byte referencePaddingBase) { - int event_length = call.getVariant().lengthOnRef(); - if ( event_length == 0 ) { // insertion - - l.add( Allele.create(referencePaddingBase,true) ); - l.add( Allele.create((char)referencePaddingBase + new String(call.getVariant().getBases()), false )); - - } else { //deletion: - l.add( Allele.create((char)referencePaddingBase + new String(call.getVariant().getBases()), true )); - l.add( Allele.create(referencePaddingBase,false) ); - } - } - - public void printVCFLine(VariantContextWriter vcf, IndelPrecall nCall, IndelPrecall tCall, boolean discard_event) { - - long start = tCall.getPosition()-1; - long stop = start; - - // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. - // The suggestion is instead of putting the base before the indel, to put the base after the indel. - // For now, just don't print out that site. - if ( start == 0 ) - return; - - GenotypeBuilder nGB = new GenotypeBuilder(); - GenotypeBuilder tGB = new GenotypeBuilder(); - - nCall.addStatsAttributes(nGB); - tCall.addStatsAttributes(tGB); - - Map attrs = new HashMap(); - - boolean isSomatic = false; - if ( nCall.getVariant() == null && tCall.getVariant() != null ) { - isSomatic = true; - attrs.put(VCFConstants.SOMATIC_KEY,true); - } - List alleles = new ArrayList(2); // all alleles at the site - // List normal_alleles = null; // all alleles at the site - List homRefAlleles = null; - -// if ( nCall.getVariant() == null || tCall.getVariant() == null ) { - homRefAlleles = new ArrayList(2) ; // we need this for somatic calls (since normal is ref-ref), and also for no-calls -// } - boolean homRefT = ( tCall.getVariant() == null ); - boolean homRefN = ( nCall.getVariant() == null ); - final byte referencePaddingBase = refBases[(int)start-1]; - if ( tCall.getVariant() == null && nCall.getVariant() == null) { - // no indel at all ; create base-representation ref/ref alleles for genotype construction - alleles.add( Allele.create(referencePaddingBase,true) ); - } else { - // we got indel(s) - int event_length = 0; - if ( tCall.getVariant() != null ) { - // indel in tumor - event_length = tCall.getVariant().lengthOnRef(); - fillAlleleList(alleles, tCall, referencePaddingBase); - } else { - event_length = nCall.getVariant().lengthOnRef(); - fillAlleleList(alleles, nCall, referencePaddingBase); - } - if ( event_length > 0 ) stop += event_length; - } - homRefAlleles.add( alleles.get(0)); - homRefAlleles.add( alleles.get(0)); - - GenotypesContext genotypes = GenotypesContext.create(); - - for ( String sample : normalSamples ) { - genotypes.add(nGB.name(sample).alleles(homRefN ? homRefAlleles : alleles).make()); - } - - for ( String sample : tumorSamples ) { - genotypes.add(tGB.name(sample).alleles(homRefT ? homRefAlleles : alleles).make()); - } - - Set filters = null; - if ( tCall.getVariant() != null && discard_event ) { - filters = new HashSet(); - filters.add("NoCall"); - } - - VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) - .genotypes(genotypes).filters(filters).attributes(attrs).make(); - vcf.add(vc); - } - - @Override - public void onTraversalDone(Integer result) { - if ( DEBUG ) { - System.out.println("DEBUG>> Emitting last window at "+normal_context.getStart()+"-"+normal_context.getStop()); - } - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); // emit everything we might have left - - if ( metricsWriter != null ) { - metricsWriter.println(String.format("Normal calls made %d", normalCallsMade)); - metricsWriter.println(String.format("Tumor calls made %d", tumorCallsMade)); - metricsWriter.close(); - } - - try { - if ( bedWriter != null ) bedWriter.close(); - if ( verboseWriter != null ) verboseWriter.close(); - } catch (IOException e) { - System.out.println("Failed to close output BED file gracefully, data may be lost"); - e.printStackTrace(); - } - super.onTraversalDone(result); - } - - @Override - public Integer reduce(Integer value, Integer sum) { - if ( value == -1 ) { - onTraversalDone(sum); - System.exit(1); - } - sum += value; - return sum; - } - - @Override - public Integer reduceInit() { - return 0; - } - - - static class IndelVariant { - public static enum Type { I, D}; - private String bases; - private Type type; - private ArrayList fromStartOffsets = null; - private ArrayList fromEndOffsets = null; - - private Set reads = new HashSet(); // keep track of reads that have this indel - private Set samples = new HashSet(); // which samples had the indel described by this object - - public IndelVariant(ExpandedSAMRecord read , Type type, String bases) { - this.type = type; - this.bases = bases.toUpperCase(); - addObservation(read); - fromStartOffsets = new ArrayList(); - fromEndOffsets = new ArrayList(); - } - - /** Adds another observation for the current indel. It is assumed that the read being registered - * does contain the observation, no checks are performed. Read's sample is added to the list of samples - * this indel was observed in as well. - * @param read - */ - public void addObservation(ExpandedSAMRecord read) { - if ( reads.contains(read) ) { - //TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam - // seeing the same read again can mean only one thing: the input bam file is corrupted and contains - // duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector - // so this is a short-term patch: don't cry, but just ignore the duplicate record - - //throw new StingException("Attempting to add indel observation that was already registered"); - return; - } - reads.add(read); - String sample = null; - if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample(); - if ( sample != null ) samples.add(sample); - } - - - /** Returns length of the event on the reference (number of deleted bases - * for deletions, -1 for insertions. - * @return - */ - public int lengthOnRef() { - if ( type == Type.D ) return bases.length(); - else return 0; - } - - - public void addSample(String sample) { - if ( sample != null ) - samples.add(sample); - } - - public void addReadPositions(int fromStart, int fromEnd) { - fromStartOffsets.add(fromStart); - fromEndOffsets.add(fromEnd); - } - - public List getOffsetsFromStart() { return fromStartOffsets ; } - public List getOffsetsFromEnd() { return fromEndOffsets; } - - public String getSamples() { - StringBuffer sb = new StringBuffer(); - Iterator i = samples.iterator(); - while ( i.hasNext() ) { - sb.append(i.next()); - if ( i.hasNext() ) - sb.append(","); - } - return sb.toString(); - } - - public Set getReadSet() { return reads; } - - public int getCount() { return reads.size(); } - - public String getBases() { return bases; } - - public Type getType() { return type; } - - @Override - public boolean equals(Object o) { - if ( ! ( o instanceof IndelVariant ) ) return false; - IndelVariant that = (IndelVariant)o; - return ( this.type == that.type && this.bases.equals(that.bases) ); - } - - public boolean equals(Type type, String bases) { - return ( this.type == type && this.bases.equals(bases.toUpperCase()) ); - } - } - - /** - * Utility class that encapsulates the logic related to collecting all the stats and counts required to - * make (or discard) a call, as well as the calling heuristics that uses those data. - */ - class IndelPrecall { -// private boolean DEBUG = false; - private int NQS_MISMATCH_CUTOFF = 1000000; - private double AV_MISMATCHES_PER_READ = 1.5; - - private int nqs = 0; - private IndelVariant consensus_indel = null; // indel we are going to call - private long pos = -1 ; // position on the ref - private int total_coverage = 0; // total number of reads overlapping with the event - private int consensus_indel_count = 0; // number of reads, in which consensus indel was observed - private int all_indel_count = 0 ; // number of reads, in which any indel was observed at current position - - private int total_mismatches_in_nqs_window = 0; // total number of mismatches in the nqs window around the indel - private int total_bases_in_nqs_window = 0; // total number of bases in the nqs window (some reads may not fully span the window so it's not coverage*nqs_size) - private int total_base_qual_in_nqs_window = 0; // sum of qualitites of all the bases in the nqs window - private int total_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of all mismatching bases in the nqs window - - private int indel_read_mismatches_in_nqs_window = 0; // mismatches inside the nqs window in indel-containing reads only - private int indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from indel-containing reads only - private int indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from indel-containing reads only - private int indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from indel-containing reads only - - - private int consensus_indel_read_mismatches_in_nqs_window = 0; // mismatches within the nqs window from consensus indel reads only - private int consensus_indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from consensus indel-containing reads only - private int consensus_indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from consensus indel-containing reads only - private int consensus_indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from consensus indel-containing reads only - - - private double consensus_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain consensus indel - private double all_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain any indel at given position - private double all_read_total_mm = 0.0; // sum of all mismatches in all reads - - private double consensus_indel_read_total_mapq = 0.0; // sum of mapping qualitites of all reads with consensus indel - private double all_indel_read_total_mapq = 0.0 ; // sum of mapping qualitites of all reads with (any) indel at current position - private double all_read_total_mapq = 0.0; // sum of all mapping qualities of all reads - - private PrimitivePair.Int consensus_indel_read_orientation_cnt = new PrimitivePair.Int(); - private PrimitivePair.Int all_indel_read_orientation_cnt = new PrimitivePair.Int(); - private PrimitivePair.Int all_read_orientation_cnt = new PrimitivePair.Int(); - - private int from_start_median = 0; - private int from_start_mad = 0; - private int from_end_median = 0; - private int from_end_mad = 0; - - /** Makes an empty call (no-call) with all stats set to 0 - * - * @param position - */ - public IndelPrecall(long position) { - this.pos = position; - } - - public IndelPrecall(WindowContext context, long position, int nqs_width) { - this.pos = position; - this.nqs = nqs_width; - total_coverage = context.coverageAt(pos,true); - List variants = context.indelsAt(pos); - findConsensus(variants); - - // pos is the first base after the event: first deleted base or first base after insertion. - // hence, [pos-nqs, pos+nqs-1] (inclusive) is the window with nqs bases on each side of a no-event or an insertion - // and [pos-nqs, pos+Ndeleted+nqs-1] is the window with nqs bases on each side of a deletion. - // we initialize the nqs window for no-event/insertion case - long left = Math.max( pos-nqs, context.getStart() ); - long right = Math.min(pos+nqs-1, context.getStop()); -//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size()); - Iterator read_iter = context.getReads().iterator(); - - - while ( read_iter.hasNext() ) { - ExpandedSAMRecord rec = read_iter.next(); - SAMRecord read = rec.getSAMRecord(); - byte[] flags = rec.getExpandedMMFlags(); - byte[] quals = rec.getExpandedQuals(); - int mm = rec.getMMCount(); - - - if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue; - - long local_right = right; // end of nqs window for this particular read. May need to be advanced further right - // if read has a deletion. The gap in the middle of nqs window will be skipped - // automatically since flags/quals are set to -1 there - - boolean read_has_a_variant = false; - boolean read_has_consensus = ( consensus_indel!= null && consensus_indel.getReadSet().contains(rec) ); - for ( IndelVariant v : variants ) { - if ( v.getReadSet().contains(rec) ) { - read_has_a_variant = true; - local_right += v.lengthOnRef(); - break; - } - } - - if ( read_has_consensus ) { - consensus_indel_read_total_mm += mm; - consensus_indel_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++; - else consensus_indel_read_orientation_cnt.first++; - } - if ( read_has_a_variant ) { - all_indel_read_total_mm += mm; - all_indel_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++; - else all_indel_read_orientation_cnt.first++; - } - - all_read_total_mm+= mm; - all_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++; - else all_read_orientation_cnt.first++; - - for ( int pos_in_flags = Math.max((int)(left - read.getAlignmentStart()),0); - pos_in_flags <= Math.min((int)local_right-read.getAlignmentStart(),flags.length - 1); - pos_in_flags++) { - - if ( flags[pos_in_flags] == -1 ) continue; // gap (deletion), skip it; we count only bases aligned to the ref - total_bases_in_nqs_window++; - if ( read_has_consensus ) consensus_indel_read_bases_in_nqs_window++; - if ( read_has_a_variant ) indel_read_bases_in_nqs_window++; - - if ( quals[pos_in_flags] != -1 ) { - - total_base_qual_in_nqs_window += quals[pos_in_flags]; - if ( read_has_a_variant ) indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; - if ( read_has_consensus ) consensus_indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; - } - - if ( flags[pos_in_flags] == 1 ) { // it's a mismatch - total_mismatches_in_nqs_window++; - total_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - - if ( read_has_consensus ) { - consensus_indel_read_mismatches_in_nqs_window++; - consensus_indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - } - - if ( read_has_a_variant ) { - indel_read_mismatches_in_nqs_window++; - indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - } - } - } -// if ( pos == 3534096 ) { -// System.out.println(read.getReadName()); -// System.out.println(" cons nqs bases="+consensus_indel_read_bases_in_nqs_window); -// System.out.println(" qual sum="+consensus_indel_read_base_qual_in_nqs_window); -// } - - } - - // compute median/mad for offsets from the read starts/ends - if ( consensus_indel != null ) { - from_start_median = median(consensus_indel.getOffsetsFromStart()) ; - from_start_mad = mad(consensus_indel.getOffsetsFromStart(),from_start_median); - from_end_median = median(consensus_indel.getOffsetsFromEnd()) ; - from_end_mad = mad(consensus_indel.getOffsetsFromEnd(),from_end_median); - } - } - - /** As a side effect will sort l! - * - * @param l - * @return - */ - private int median(List l) { - Collections.sort(l); - int k = l.size()/2; - return ( l.size() % 2 == 0 ? - (l.get(k-1).intValue()+l.get(k).intValue())/2 : - l.get(k).intValue()); - } - - private int median(int[] l) { - Arrays.sort(l); - int k = l.length/2; - return ( l.length % 2 == 0 ? - (l[k-1]+l[k])/2 : - l[k]); - } - - private int mad(List l, int med) { - int [] diff = new int[l.size()]; - for ( int i = 0; i < l.size(); i++ ) { - diff[i] = Math.abs(l.get(i).intValue() - med); - } - return median(diff); - } - - public long getPosition() { return pos; } - - public boolean hasObservation() { return consensus_indel != null; } - - public int getCoverage() { return total_coverage; } - - public double getTotalMismatches() { return all_read_total_mm; } - public double getConsensusMismatches() { return consensus_indel_read_total_mm; } - public double getAllVariantMismatches() { return all_indel_read_total_mm; } - - /** Returns average number of mismatches per consensus indel-containing read */ - public double getAvConsensusMismatches() { - return ( consensus_indel_count != 0 ? consensus_indel_read_total_mm/consensus_indel_count : 0.0 ); - } - - /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants) */ - public double getAvRefMismatches() { - int coverage_ref = total_coverage-all_indel_count; - return ( coverage_ref != 0 ? (all_read_total_mm - all_indel_read_total_mm )/coverage_ref : 0.0 ); - } - - public PrimitivePair.Int getConsensusStrandCounts() { - return consensus_indel_read_orientation_cnt; - } - - public PrimitivePair.Int getRefStrandCounts() { - return new PrimitivePair.Int(all_read_orientation_cnt.first-all_indel_read_orientation_cnt.first, - all_read_orientation_cnt.second - all_indel_read_orientation_cnt.second); - } - - /** Returns a sum of mapping qualities of all reads spanning the event. */ - public double getTotalMapq() { return all_read_total_mapq; } - - /** Returns a sum of mapping qualities of all reads, in which the consensus variant is observed. */ - public double getConsensusMapq() { return consensus_indel_read_total_mapq; } - - /** Returns a sum of mapping qualities of all reads, in which any variant is observed at the current event site. */ - public double getAllVariantMapq() { return all_indel_read_total_mapq; } - - /** Returns average mapping quality per consensus indel-containing read. */ - public double getAvConsensusMapq() { - return ( consensus_indel_count != 0 ? consensus_indel_read_total_mapq/consensus_indel_count : 0.0 ); - } - - /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants). */ - public double getAvRefMapq() { - int coverage_ref = total_coverage-all_indel_count; - return ( coverage_ref != 0 ? (all_read_total_mapq - all_indel_read_total_mapq )/coverage_ref : 0.0 ); - } - - /** Returns fraction of bases in NQS window around the indel that are mismatches, across all reads, - * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around - * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} - * */ - public double getNQSConsensusMMRate() { - if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; - return ((double)consensus_indel_read_mismatches_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; - } - - /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads - * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref - * reads is defined around the event start position, NOT around the actual consensus indel. - * */ - public double getNQSRefMMRate() { - int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; - if ( num_ref_bases == 0 ) return 0; - return ((double)(total_mismatches_in_nqs_window - indel_read_mismatches_in_nqs_window))/num_ref_bases; - } - - /** Returns average base quality in NQS window around the indel, across all reads, - * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around - * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} - * */ - public double getNQSConsensusAvQual() { - if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; - return ((double)consensus_indel_read_base_qual_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; - } - - /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads - * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref - * reads is defined around the event start position, NOT around the actual consensus indel. - * */ - public double getNQSRefAvQual() { - int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; - if ( num_ref_bases == 0 ) return 0; - return ((double)(total_base_qual_in_nqs_window - indel_read_base_qual_in_nqs_window))/num_ref_bases; - } - - public int getTotalNQSMismatches() { return total_mismatches_in_nqs_window; } - - public int getAllVariantCount() { return all_indel_count; } - public int getConsensusVariantCount() { return consensus_indel_count; } - -// public boolean failsNQSMismatch() { -// //TODO wrong fraction: mismatches are counted only in indel-containing reads, but total_coverage is used! -// return ( indel_read_mismatches_in_nqs_window > NQS_MISMATCH_CUTOFF ) || -// ( indel_read_mismatches_in_nqs_window > total_coverage * AV_MISMATCHES_PER_READ ); -// } - - public IndelVariant getVariant() { return consensus_indel; } - - public void fillContext(JexlContext context,String[] cassette) { - context.set(cassette[C_INDEL_F],((double)consensus_indel_count)/total_coverage); - context.set(cassette[C_INDEL_CF],((double)consensus_indel_count/all_indel_count)); - context.set(cassette[C_COV],total_coverage); - context.set(cassette[C_CONS_CNT],consensus_indel_count); - } -/* - public boolean isCall() { - boolean ret = ( consensus_indel_count >= minIndelCount && - (double)consensus_indel_count > minFraction * total_coverage && - (double) consensus_indel_count > minConsensusFraction*all_indel_count && total_coverage >= minCoverage); - if ( DEBUG && ! ret ) System.out.println("DEBUG>> NOT a call: count="+consensus_indel_count+ - " total_count="+all_indel_count+" cov="+total_coverage+ - " minConsensusF="+((double)consensus_indel_count)/all_indel_count+ - " minF="+((double)consensus_indel_count)/total_coverage); - - return ret; -// return true; - } -*/ - /** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed - * variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus) - * @param variants - * @return - */ - private void findConsensus(List variants) { - for ( IndelVariant var : variants ) { - if ( DEBUG ) System.out.println("DEBUG>> Variant "+var.getBases()+" (cnt="+var.getCount()+")"); - int cnt = var.getCount(); - all_indel_count +=cnt; - if ( cnt > consensus_indel_count ) { - consensus_indel = var; - consensus_indel_count = cnt; - } - } - if ( DEBUG && consensus_indel != null ) System.out.println("DEBUG>> Returning: "+consensus_indel.getBases()+ - " (cnt="+consensus_indel.getCount()+") with total count of "+all_indel_count); - } - - - - public void printBedLine(Writer bed) { - int event_length; - if ( consensus_indel == null ) event_length = 0; - else { - event_length = consensus_indel.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - } - - StringBuffer message = new StringBuffer(); - message.append(refName+"\t"+(pos-1)+"\t"); - message.append((pos-1+event_length)+"\t"); - if ( consensus_indel != null ) { - message.append((event_length>0? "-":"+")+consensus_indel.getBases()); - } else { - message.append('.'); - } - message.append(":"+all_indel_count+"/"+total_coverage); - try { - bed.write(message.toString()+"\n"); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(bedOutput, "Error encountered while writing into output BED file", e); - } - } - - public String makeEventString() { - int event_length; - if ( consensus_indel == null ) event_length = 0; - else { - event_length = consensus_indel.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - } - StringBuffer message = new StringBuffer(); - message.append(refName); - message.append('\t'); - message.append(pos-1); - message.append('\t'); - message.append(pos-1+event_length); - message.append('\t'); - if ( consensus_indel != null ) { - message.append((event_length>0?'-':'+')); - message.append(consensus_indel.getBases()); - } else { - message.append('.'); - } - return message.toString(); - } - - public String makeStatsString(String prefix) { - StringBuilder message = new StringBuilder(); - message.append(prefix+"OBS_COUNTS[C/A/T]:"+getConsensusVariantCount()+"/"+getAllVariantCount()+"/"+getCoverage()); - message.append('\t'); - message.append(prefix+"AV_MM[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMismatches(), - getAvRefMismatches())); - message.append('\t'); - message.append(prefix+"AV_MAPQ[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMapq(), - getAvRefMapq())); - message.append('\t'); - message.append(prefix+"NQS_MM_RATE[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusMMRate(),getNQSRefMMRate())); - message.append('\t'); - message.append(prefix+"NQS_AV_QUAL[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusAvQual(),getNQSRefAvQual())); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - message.append('\t'); - message.append(prefix+"STRAND_COUNTS[C/C/R/R]:"+strand_cons.first+"/"+strand_cons.second+"/"+strand_ref.first+"/"+strand_ref.second); - - message.append('\t'); - message.append(prefix+"OFFSET_RSTART:"+from_start_median+"/"+from_start_mad); - message.append('\t'); - message.append(prefix+"OFFSET_REND:"+from_end_median+"/"+from_end_mad); - - return message.toString(); - - } - - /** - * Places alignment statistics into attribute map and returns the map. If attr parameter is null, - * a new map is allocated, filled and returned. If attr is not null, new attributes are added to that - * preexisting map, and the same instance of the (updated) map is returned. - * - * @param attr - * @return - */ - public Map makeStatsAttributes(Map attr) { - if ( attr == null ) attr = new HashMap(); - - VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),attr); - - VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),attr); - - VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),attr); - - VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),attr); - - VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),attr); - - VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,attr); - - VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,attr); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - - VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,attr); - return attr; - } - - - /** - * Adds alignment statistics directly into the genotype builder object. - * - * @param gb - * @return - */ - public GenotypeBuilder addStatsAttributes(GenotypeBuilder gb) { - if ( gb == null ) gb = new GenotypeBuilder(); - - gb = VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),gb); - - gb = VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),gb); - - gb = VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),gb); - - gb = VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),gb); - - gb = VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),gb); - - gb = VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,gb); - - gb = VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,gb); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - - gb = VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,gb); - return gb; - } - - } - - interface IndelListener { - public void addObservation(int pos, IndelVariant.Type t, String bases, int fromStart, int fromEnd, ExpandedSAMRecord r); - } - - class WindowContext implements IndelListener { - private Set reads; - private int start=0; // where the window starts on the ref, 1-based - private CircularArray< List< IndelVariant > > indels; - - private List emptyIndelList = new ArrayList(); - - - public WindowContext(int start, int length) { - this.start = start; - indels = new CircularArray< List >(length); -// reads = new LinkedList(); - reads = new HashSet(); - } - - /** Returns 1-based reference start position of the interval this object keeps context for. - * - * @return - */ - public int getStart() { return start; } - - /** Returns 1-based reference stop position (inclusive) of the interval this object keeps context for. - * - * @return - */ - public int getStop() { return start + indels.length() - 1; } - - /** Resets reference start position to 0 and clears the context. - * - */ - public void clear() { - start = 0; - reads.clear(); - indels.clear(); - } - - /** - * Returns true if any indel observations are present in the specified interval - * [begin,end] (1-based, inclusive). Interval can be partially of fully outside of the - * current context window: positions outside of the window will be ignored. - * @param begin - * @param end - */ - public boolean hasIndelsInInterval(long begin, long end) { - for ( long k = Math.max(start,begin); k < Math.min(getStop(),end); k++ ) { - if ( indelsAt(k) != emptyIndelList ) return true; - } - return false; - } - - public Set getReads() { return reads; } - - /** Returns the number of reads spanning over the specified reference position - * (regardless of whether they have a base or indel at that specific location). - * The second argument controls whether to count with indels in mind (this is relevant for insertions only, - * deletions do not require any special treatment since they occupy non-zero length on the ref and since - * alignment can not start or end with a deletion). For insertions, note that, internally, we assign insertions - * to the reference position right after the actual event, and we count all events assigned to a given position. - * This count (reads with indels) should be contrasted to reads without indels, or more rigorously, reads - * that support the ref rather than the indel. Few special cases may occur here: - * 1) an alignment that ends (as per getAlignmentEnd()) right before the current position but has I as its - * last element: we have to count that read into the "coverage" at the current position for the purposes of indel - * assessment, as the indel in that read will be counted at the current position, so the total coverage - * should be consistent with that. - */ - /* NOT IMPLEMENTED: 2) alsignments that start exactly at the current position do not count for the purpose of insertion - * assessment since they do not contribute any evidence to either Ref or Alt=insertion hypothesis, unless - * the alignment starts with I (so that we do have evidence for an indel assigned to the current position and - * read should be counted). For deletions, reads starting at the current position should always be counted (as they - * show no deletion=ref). - * @param refPos position on the reference; must be within the bounds of the window - */ - public int coverageAt(final long refPos, boolean countForIndels) { - int cov = 0; - for ( ExpandedSAMRecord read : reads ) { - if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) { - if ( countForIndels && read.getSAMRecord().getAlignmentEnd() == refPos - 1) { - Cigar c = read.getSAMRecord().getCigar(); - if ( c.getCigarElement(c.numCigarElements()-1).getOperator() == CigarOperator.I ) cov++; - } - continue; - } - cov++; - } - return cov; - } - - - /** Shifts current window to the right along the reference contig by the specified number of bases. - * The context will be updated accordingly (indels and reads that go out of scope will be dropped). - * @param offset - */ - public void shift(int offset) { - start += offset; - - indels.shiftData(offset); - if ( indels.get(0) != null && indels.get(0).size() != 0 ) { - IndelVariant indel = indels.get(0).get(0); - - System.out.println("WARNING: Indel(s) at first position in the window ("+refName+":"+start+"): currently not supported: "+ - (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; read: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()+"; site ignored"); - indels.get(0).clear(); -// throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+ -// (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()); - } - - Iterator read_iter = reads.iterator(); - - while ( read_iter.hasNext() ) { - ExpandedSAMRecord r = read_iter.next(); - if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope - read_iter.remove(); - } - } - } - - public void add(SAMRecord read, byte [] ref) { - - if ( read.getAlignmentStart() < start ) return; // silently ignore reads starting before the window start - - ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this); - //TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed - if ( reads.contains(er)) return; // ignore duplicate records - reads.add(er); - } - - public void addObservation(int pos, IndelVariant.Type type, String bases, int fromStart, int fromEnd, ExpandedSAMRecord rec) { - List indelsAtSite; - try { - indelsAtSite = indels.get(pos); - } catch (IndexOutOfBoundsException e) { - SAMRecord r = rec.getSAMRecord(); - System.out.println("Failed to add indel observation, probably out of coverage window bounds (trailing indel?):\nRead "+ - r.getReadName()+": "+ - "read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+ - r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+ - "; window end="+getStop()); - throw e; - } - - if ( indelsAtSite == null ) { - indelsAtSite = new ArrayList(); - indels.set(pos, indelsAtSite); - } - - IndelVariant indel = null; - for ( IndelVariant v : indelsAtSite ) { - if ( ! v.equals(type, bases) ) continue; - - indel = v; - indel.addObservation(rec); - break; - } - - if ( indel == null ) { // not found: - indel = new IndelVariant(rec, type, bases); - indelsAtSite.add(indel); - } - indel.addReadPositions(fromStart,fromEnd); - } - - public List indelsAt( final long refPos ) { - List l = indels.get((int)( refPos - start )); - if ( l == null ) return emptyIndelList; - else return l; - } - - - } - - - class ExpandedSAMRecord { - private SAMRecord read; - private byte[] mismatch_flags; - private byte[] expanded_quals; - private int mms; - - public ExpandedSAMRecord(SAMRecord r, byte [] ref, long offset, IndelListener l) { - - read = r; - final long rStart = read.getAlignmentStart(); - final long rStop = read.getAlignmentEnd(); - final byte[] readBases = read.getReadString().toUpperCase().getBytes(); - - ref = new String(ref).toUpperCase().getBytes(); - - mismatch_flags = new byte[(int)(rStop-rStart+1)]; - expanded_quals = new byte[(int)(rStop-rStart+1)]; - - // now let's extract indels: - - Cigar c = read.getCigar(); - final int nCigarElems = c.numCigarElements(); - - - int readLength = 0; // length of the aligned part of the read NOT counting clipped bases - for ( CigarElement cel : c.getCigarElements() ) { - - switch(cel.getOperator()) { - case H: - case S: - case D: - case N: - case P: - break; // do not count gaps or clipped bases - case I: - case M: - case EQ: - case X: - readLength += cel.getLength(); - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: "+cel.getOperator()); - } - } - - int fromStart = 0; - int posOnRead = 0; - int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read: - // its start on the actual full reference contig is r.getAlignmentStart() - for ( int i = 0 ; i < nCigarElems ; i++ ) { - - final CigarElement ce = c.getCigarElement(i); - IndelVariant.Type type = null; - String indel_bases = null; - int eventPosition = posOnRef; - - switch(ce.getOperator()) { - case H: break; // hard clipped reads do not have clipped indel_bases in their sequence, so we just ignore the H element... - case I: - type = IndelVariant.Type.I; - indel_bases = read.getReadString().substring(posOnRead,posOnRead+ce.getLength()); - // will increment position on the read below, there's no 'break' statement yet... - case S: - // here we also skip soft-clipped indel_bases on the read; according to SAM format specification, - // alignment start position on the reference points to where the actually aligned - // (not clipped) indel_bases go, so we do not need to increment reference position here - posOnRead += ce.getLength(); - break; - case D: - type = IndelVariant.Type.D; - indel_bases = new String( ref, posOnRef, ce.getLength() ); - for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1; - - break; - case M: - case EQ: - case X: - for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { - if ( readBases[posOnRead] != ref[posOnRef] ) { // mismatch! - mms++; - mismatch_flags[posOnRef] = 1; - } - expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead]; - } - fromStart += ce.getLength(); - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator()); - } - - if ( type == null ) continue; // element was not an indel, go grab next element... - - // we got an indel if we are here... - if ( i == 0 ) logger.debug("Indel at the start of the read "+read.getReadName()); - if ( i == nCigarElems - 1) logger.debug("Indel at the end of the read "+read.getReadName()); - - // note that here we will be assigning indels to the first deleted base or to the first - // base after insertion, not to the last base before the event! - int fromEnd = readLength - fromStart; - if ( type == IndelVariant.Type.I ) fromEnd -= ce.getLength(); - - l.addObservation((int)(offset+eventPosition), type, indel_bases, fromStart, fromEnd, this); - - if ( type == IndelVariant.Type.I ) fromStart += ce.getLength(); - - } - } - - public SAMRecord getSAMRecord() { return read; } - - public byte [] getExpandedMMFlags() { return mismatch_flags; } - - public byte [] getExpandedQuals() { return expanded_quals; } - - public int getMMCount() { return mms; } - - public boolean equals(Object o) { - if ( this == o ) return true; - if ( read == null ) return false; - if ( o instanceof SAMRecord ) return read.equals(o); - if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read); - return false; - } - - - } - -} - - -class VCFIndelAttributes { - public static String ALLELIC_DEPTH_KEY = VCFConstants.GENOTYPE_ALLELE_DEPTHS; - public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY; - - public static String MAPQ_KEY = "MQS"; - - public static String MM_KEY = "MM"; - - public static String NQS_MMRATE_KEY = "NQSMM"; - - public static String NQS_AVQ_KEY = "NQSBQ"; - - public static String STRAND_COUNT_KEY = "SC"; - public static String RSTART_OFFSET_KEY = "RStart"; - public static String REND_OFFSET_KEY = "REnd"; - - public static Set getAttributeHeaderLines() { - Set lines = new HashSet(); - - lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus reference/indel at the site")); - lines.add(new VCFFormatHeaderLine(DEPTH_TOTAL_KEY, 1, VCFHeaderLineType.Integer, "Total coverage at the site")); - - lines.add(new VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of ref-/consensus indel-supporting reads")); - - lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, "Average # of mismatches per ref-/consensus indel-supporting read")); - - lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads")); - - lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases in ref-/consensus indel-supporting reads")); - - lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)")); - - lines.add(new VCFFormatHeaderLine(RSTART_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the starts of the reads")); - lines.add(new VCFFormatHeaderLine(REND_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the ends of the reads")); - - return lines; - } - - public static Map recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, Map attrs) { - attrs.put(STRAND_COUNT_KEY, new Integer[] {cnt_cons_fwd, cnt_cons_rev, cnt_ref_fwd, cnt_ref_rev} ); - return attrs; - } - - public static GenotypeBuilder recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, - GenotypeBuilder gb) { - return gb.attribute(STRAND_COUNT_KEY, new Integer[] {cnt_ref_fwd, cnt_ref_rev,cnt_cons_fwd, cnt_cons_rev } ); - } - - public static Map recordDepth(int cnt_cons, int cnt_indel, int cnt_total, Map attrs) { - attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_total-cnt_indel, cnt_cons} ); - attrs.put(DEPTH_TOTAL_KEY, cnt_total); - return attrs; - } - - public static GenotypeBuilder recordDepth(int cnt_cons, int cnt_indel, int cnt_total, GenotypeBuilder gb) { - return gb.AD(new int[] {cnt_total-cnt_indel,cnt_cons} ).DP(cnt_total); - } - - public static Map recordAvMapQ(double cons, double ref, Map attrs) { - attrs.put(MAPQ_KEY, new Float[] {(float)ref, (float)cons} ); - return attrs; - } - - public static GenotypeBuilder recordAvMapQ(double cons, double ref, GenotypeBuilder gb) { - return gb.attribute(MAPQ_KEY,new float[] {(float)ref, (float)cons} ); - } - - public static Map recordAvMM(double cons, double ref, Map attrs) { - attrs.put(MM_KEY, new Float[] {(float)ref, (float)cons} ); - return attrs; - } - - public static GenotypeBuilder recordAvMM(double cons, double ref, GenotypeBuilder gb) { - return gb.attribute(MM_KEY, new float[] {(float)ref, (float)cons} ); - } - - public static Map recordNQSMMRate(double cons, double ref, Map attrs) { - attrs.put(NQS_MMRATE_KEY, new Float[] {(float)ref, (float)cons} ); - return attrs; - } - - public static GenotypeBuilder recordNQSMMRate(double cons, double ref, GenotypeBuilder gb) { - return gb.attribute(NQS_MMRATE_KEY, new float[] {(float)ref, (float)cons} ); - } - - public static Map recordNQSAvQ(double cons, double ref, Map attrs) { - attrs.put(NQS_AVQ_KEY, new float[] {(float)ref, (float)cons} ); - return attrs; - } - - public static GenotypeBuilder recordNQSAvQ(double cons, double ref, GenotypeBuilder gb) { - return gb.attribute(NQS_AVQ_KEY, new float[] {(float)ref, (float)cons} ); - } - - public static Map recordOffsetFromStart(int median, int mad, Map attrs) { - attrs.put(RSTART_OFFSET_KEY, new Integer[] {median, mad} ); - return attrs; - } - - public static GenotypeBuilder recordOffsetFromStart(int median, int mad, GenotypeBuilder gb) { - return gb.attribute(RSTART_OFFSET_KEY, new int[] {median, mad} ); - } - - public static Map recordOffsetFromEnd(int median, int mad, Map attrs) { - attrs.put(REND_OFFSET_KEY, new Integer[] {median, mad} ); - return attrs; - } - - public static GenotypeBuilder recordOffsetFromEnd(int median, int mad, GenotypeBuilder gb) { - return gb.attribute(REND_OFFSET_KEY, new int[] {median, mad} ); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java deleted file mode 100644 index 5a669e65c..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.collections; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - - -/** This class, closely resembling a deque (except that it is not dynamically grown), - * provides an object with array-like interface and efficient - * implementation of shift operation. Use this class when some kind of sliding window is required: - * e.g. an array (window) is populated from some stream of data, and then the window is shifted. - * If most of the data in the window remains the same so that only a few old elements sjould be popped from - * and a few new elements pushed onto the array, both re-populating the whole array from the data and - * shifting a regular array would be grossly inefficient. Instead, shiftData(int N) method of circular array - * efficiently pops out N first elements and makes last N elements available. - * - * Consider an example of reading a character stream A,B,C,D,....,Z into an array with requirement of keeping - * last 5 letters. First, we would read first 5 letters same way as we would with a regular array:

- * - * - * CircularArray a(5);
- * for ( int i = 0; i < 5; i++ ) a.set(i, readChar());
- *
- *
- * and then on the arrival of each next character we shift the array:

- * - * - * a.shiftData(1); a.set(4, readChar() );
- *
- *
- * After the lines from the above example are executed, the array will logically look as:
- * - * B,C,D,E,F,

- * - * e.g. as if we had a regular array, shifted it one element down and added new element on the top. - * - * - * @author asivache - * - */ -public class CircularArray { - - - private Object[] data ; - private int offset; - - /** Creates an array of fixed length */ - public CircularArray(int length) { - if ( length <= 0 ) throw new ReviewedStingException("CircularArray length must be positive. Passed: "+length); - data = new Object[length]; - offset = 0; - } - - /** Returns length of the array */ - public int length() { - return data.length; - } - - /** Gets i-th element of the array - * - * @throws IndexOutOfBoundsException if value of i is illegal - */ - @SuppressWarnings("unchecked") - public T get(int i) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; element requested: "+i); - return (T)(data [ ( offset + i ) % data.length ]); - } - - /** Sets i-th element of the array to the specified value. - * - * @throws IndexOutOfBoundsException if value of i is illegal - */ - public void set(int i, T value) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; set element request at: "+i); - data [ ( offset + i ) % data.length ] = value; - } - - /** Set all elements to null. - * - */ - public void clear() { - for ( int i = 0 ; i < data.length ; i++ ) data[i] = null; - offset = 0; - } - - /** Efficient shift-down of the array data. After this operation, array.get(0), array.get(1), etc will - * be returning what array.get(shift), array.get(shift+1),... were returning before the shift was performed, - * and last shift elements of the array will be reset to 0. - * @param shift - */ - public void shiftData(int shift) { - if ( shift >= data.length ) { - // if we shift by more than the length of stored data, we lose - // all that data completely, so we just re-initialize the array. - // This is not the operating mode CircularArray is intended for - // but we can handle it, just in case. - for ( int i = 0 ; i < data.length ; i++ ) data[i] = null; - offset = 0; - return; - } - - // shift < data.length, so at least some data should be preserved - - final int newOffset = ( offset+shift ) % data.length; - if ( newOffset < offset ) { - // wrap-around! - for ( int i = offset ; i < data.length ; i++ ) data[i] = null; - for ( int i = 0; i < newOffset ; i++ ) data[i] = null; - } else { - for ( int i = offset ; i < newOffset ; i++ ) data[i] = null; - } - offset = newOffset; - } - - - - /** Implements primitive int type-based circular array. See CircularArray for details. - * - * @author asivache - * - */ - public static class Int { - private int [] data ; - private int offset; - - /** Creates an array of fixed length */ - public Int(int length) { - if ( length <= 0 ) throw new ReviewedStingException("CircularArray length must be positive. Passed: "+length); - data = new int[length]; // automaticaly initialized to zeros - offset = 0; - } - - /** Returns length of the array */ - public int length() { - return data.length; - } - - /** Gets i-th element of the array - * - * @throws IndexOutOfBoundsException if value of i is illegal - */ - public int get(int i) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; element requested: "+i); - return data [ ( offset + i ) % data.length ]; - } - - /** Sets i-th element of the array to the specified value. - * - * @throws IndexOutOfBoundsException if value of i is illegal - */ - public void set(int i, int value) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; set element request at: "+i); - data [ ( offset + i ) % data.length ] = value; - } - - /** Increments i-th element of the array by the specified value (value can be negative). - * - * @throws IndexOutOfBoundsException if i is illegal - */ - public void increment(int i, int value) { - if ( i < 0 || i >= data.length ) - throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; increment element request at: "+i); - data [ ( offset + i ) % data.length ] += value; - } - - /** Set all elements to 0. - * - */ - public void clear() { - for ( int i = 0 ; i < data.length ; i++ ) data[i] = 0; - offset = 0; - } - - /** Efficient shift-down of the array data. After this operation, array.get(0), array.get(1), etc will - * be returning what array.get(shift), array.get(shift+1),... were returning before the shift was performed, - * and last shift elements of the array will be reset to 0. - * @param shift - */ - public void shiftData(int shift) { - if ( shift >= data.length ) { - // if we shift by more than the length of stored data, we lose - // all that data completely, so we just re-initialize the array. - // This is not the operating mode CircularArray is intended for - // but we can handle it, just in case. - for ( int i = 0 ; i < data.length ; i++ ) data[i] = 0; - offset = 0; - return; - } - - // shift < data.length, so at least some data should be preserved - - final int newOffset = ( offset+shift ) % data.length; - if ( newOffset < offset ) { - // wrap-around! - for ( int i = offset ; i < data.length ; i++ ) data[i] = 0; - for ( int i = 0; i < newOffset ; i++ ) data[i] = 0; - } else { - for ( int i = offset ; i < newOffset ; i++ ) data[i] = 0; - } - offset = newOffset; - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java deleted file mode 100755 index 29ffb13e4..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.interval; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Oct 7, 2010 - * Time: 2:40:02 PM - * To change this template use File | Settings | File Templates. - */ - -/** This class provides an adapter to Iterator that returns only (parts of) underlying iterator's - * intervals overlapping with specified "master set" of bounding intervals. The underlying iterator must return - * NON-overlapping intervals in coordinate-sorted order, otherwise the behavior is unspecified. If the master set is represented by - * another interval iterator, it should return sorted and NON-overlapping intervals. - * - */ -public class OverlappingIntervalIterator implements Iterator { - PushbackIterator iter = null; - PushbackIterator boundBy = null; - - GenomeLoc prefetchedOverlap = null; - GenomeLoc currentBound = null; - GenomeLoc currentInterval = null; - - - /** Creates new overlapping iterator that will internally traverse intervals and return only - * overlaps of those with set of intervals returned by boundBy. - * @param intervals - * @param boundBy - */ - public OverlappingIntervalIterator(Iterator intervals, Iterator boundBy) { - this.iter = new PushbackIterator(intervals); - this.boundBy = new PushbackIterator(boundBy); - - if ( iter.hasNext() && boundBy.hasNext() ) { - currentInterval = iter.next(); // load first interval - currentBound = boundBy.next(); // load first bounding interval - fetchNextOverlap(); - } - } - - /** Traverses both iterators in sync, until the first overlap between the two is reached. If no overlap is found - * until the end of the either of the two streams, leaves prefetchedOverlap set to null - */ - private void fetchNextOverlap() { - - prefetchedOverlap = null; - // System.out.println("Fetching... (interval="+currentInterval+"; bound="+currentBound+")"); - while ( currentInterval != null && currentBound != null ) { - - if ( currentInterval.isBefore(currentBound) ) { -// System.out.println(currentInterval +" is before "+currentBound ); - if ( ! iter.hasNext() ) currentInterval = null; - else currentInterval = iter.next(); - continue; - } - - if ( currentInterval.isPast(currentBound) ) { -// System.out.println(currentInterval +" is past "+currentBound ); - if ( ! boundBy.hasNext() ) currentBound = null; - else currentBound = boundBy.next(); - continue; - } - - // we are at this point only if currentInterval overlaps with currentBound - - prefetchedOverlap = currentInterval.intersect(currentBound); -// System.out.println("Fetched next overlap: "+prefetchedOverlap); - // now we need to advance at least one of the iterators, so that we would not - // call the same overlap again - - // however we still do not know if we are done with either current interval or current bound, because - // two special situations are possible: - // - // 1) next interval overlaps with 2) current interval also overlaps with - // the same bounding interval; next bounding interval; note that - // note that in this case next in this case next bound necessarily - // interval necessarily starts before starts before the next interval - // the next bound - // - // curr. int next int. curr. int - // ----- ------ -------------------------- - // ------------------- --------- ------------- - // curr. bound curr. bound next bound - - // To solve this issue we update either only currentInterval or only currentBound to their next value, - // whichever of those next values (intervals) comes first on the reference genome; - // the rest of the traversal to the next overlap will be performed on the next invocation of - // fetchNextOverlap(). - - advanceToNearest(); - - break; // now that we computed the overlap and advanced (at least one of) the intervals/bounds to - // the next location, we are done - bail out from the loop. - } - - } - - private void advanceToNearest() { - if ( ! iter.hasNext() ) { - currentBound = boundBy.hasNext() ? boundBy.next() : null; - } else { - if ( ! boundBy.hasNext() ) currentInterval = iter.hasNext() ? iter.next() : null; - else { - // both intervals and bounds have next value available; let's check which comes first: - GenomeLoc nextInterval = iter.next(); - GenomeLoc nextBound = boundBy.next(); - - if ( nextInterval.compareTo(nextBound) < 0 ) { - currentInterval = nextInterval; - boundBy.pushback(nextBound); - } else { - currentBound = nextBound; - iter.pushback(nextInterval); - } - - } - } - } - - /** - * Returns true if the iteration has more elements. (In other - * words, returns true if next would return an element - * rather than throwing an exception.) - * - * @return true if the iterator has more elements. - */ - public boolean hasNext() { - return prefetchedOverlap != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration. - * @throws java.util.NoSuchElementException - * iteration has no more elements. - */ - public GenomeLoc next() { - if ( prefetchedOverlap == null ) - throw new java.util.NoSuchElementException("Illegal call to next(): Overlapping iterator has no more overlaps"); - GenomeLoc ret = prefetchedOverlap; // cache current prefetched overlap - fetchNextOverlap(); // prefetch next overlap - return ret ; - } - - /** - * Removes from the underlying collection the last element returned by the - * iterator (optional operation). This method can be called only once per - * call to next. The behavior of an iterator is unspecified if - * the underlying collection is modified while the iteration is in - * progress in any way other than by calling this method. - * - * @throws UnsupportedOperationException if the remove - * operation is not supported by this Iterator. - * @throws IllegalStateException if the next method has not - * yet been called, or the remove method has already - * been called after the last call to the next - * method. - */ - public void remove() { - throw new UnsupportedOperationException("remove() method is not supported by OverlappingIntervalIterator"); - //To change body of implemented methods use File | Settings | File Templates. - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 7ef05edd8..5ad4f5c8b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -236,18 +236,6 @@ public class AlignmentUtils { return n; } - public static int getNumAlignedBases(final SAMRecord r) { - int n = 0; - final Cigar cigar = r.getCigar(); - if (cigar == null) return 0; - - for (final CigarElement e : cigar.getCigarElements()) - if (e.getOperator() == CigarOperator.M) - n += e.getLength(); - - return n; - } - public static int getNumAlignedBasesCountingSoftClips(final SAMRecord r) { int n = 0; final Cigar cigar = r.getCigar(); @@ -512,47 +500,6 @@ public class AlignmentUtils { return true; } - /** - * Returns true is read is mapped and mapped uniquely (Q>0). - * - * @param read - * @return - */ - public static boolean isReadUniquelyMapped(SAMRecord read) { - return (!AlignmentUtils.isReadUnmapped(read)) && read.getMappingQuality() > 0; - } - - /** - * Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from - * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). - * - * @param read - * @return - */ - public static byte[] getQualsInCycleOrder(SAMRecord read) { - if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getBaseQualities(); - - return Utils.reverse(read.getBaseQualities()); - } - - /** - * Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from - * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities - * are available this method will throw a runtime exception. - * - * @param read - * @return - */ - public static byte[] getOriginalQualsInCycleOrder(SAMRecord read) { - if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getOriginalBaseQualities(); - - return Utils.reverse(read.getOriginalBaseQualities()); - } - /** * Takes the alignment of the read sequence readSeq to the reference sequence refSeq * starting at 0-based position refIndex on the refSeq and specified by its cigar. From 5558a6b8f7aaaf77a961754b6f6e41e2b3e44ec7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 29 Dec 2012 14:34:17 -0500 Subject: [PATCH 08/22] Deleting / archiving no longer classes -- AminoAcidTable and AminoAcid goes to the archive -- Removing two unused SAMRecord classes --- ...CoordinateComparatorWithUnmappedReads.java | 60 ----- .../broadinstitute/sting/utils/AminoAcid.java | 93 -------- .../sting/utils/AminoAcidTable.java | 214 ------------------ .../sting/utils/sam/ComparableSAMRecord.java | 68 ------ 4 files changed, 435 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java delete mode 100755 public/java/src/org/broadinstitute/sting/utils/AminoAcid.java delete mode 100755 public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java delete mode 100755 public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java deleted file mode 100644 index 3854a4a8c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SAMRecordCoordinateComparatorWithUnmappedReads.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMRecordCoordinateComparator; - -/** - * Extends Picard's Comparator for sorting SAMRecords by coordinate. This one actually deals with unmapped reads - * (among other things) sitting at the same position as their mates (so that they both can be put into the same set). - */ -public class SAMRecordCoordinateComparatorWithUnmappedReads extends SAMRecordCoordinateComparator { - public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) { - int cmp = fileOrderCompare(samRecord1, samRecord2); - if ( cmp != 0 ) - return cmp; - - // deal with unmapped reads - if ( samRecord1.getReadUnmappedFlag() != samRecord2.getReadUnmappedFlag() ) - return (samRecord1.getReadUnmappedFlag()? 1: -1); - - if ( samRecord1.getReadNegativeStrandFlag() != samRecord2.getReadNegativeStrandFlag() ) - return (samRecord1.getReadNegativeStrandFlag()? 1: -1); - - // even the names can be the same - cmp = samRecord1.getReadName().compareTo(samRecord2.getReadName()); - if ( cmp != 0 ) - return cmp; - - if ( samRecord1.getDuplicateReadFlag() != samRecord2.getDuplicateReadFlag() ) - return (samRecord1.getDuplicateReadFlag()? -1: 1); - - if ( samRecord1.getReadPairedFlag() && samRecord2.getReadPairedFlag() && samRecord1.getFirstOfPairFlag() != samRecord2.getFirstOfPairFlag() ) - return (samRecord1.getFirstOfPairFlag()? -1: 1); - - // such a case was actually observed - return samRecord1.getMappingQuality() - samRecord2.getMappingQuality(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java deleted file mode 100755 index 0b47093fa..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -/** - * Represents a single amino acid. - */ -public class AminoAcid { - private String name; - private String threeLetterCode; - private String letter; - - - /** - * Constructor. - * - * @param letter The 1 letter code. (eg. I). This is '*' for the stop codon. - * @param name The full name of the AA (eg. Isoleucine). - * @param code The 3 letter code. (eg. Ile). - */ - public AminoAcid( String letter, String name, String code) { - this.name = name; - this.threeLetterCode = code; - this.letter = letter; - } - - /** Equality based on the amino acid code. */ - public boolean equals(Object o) { - if (this == o) { return true; } - if (o == null || !(o instanceof AminoAcid)) { return false; } - - final AminoAcid aminoAcid = (AminoAcid) o; - return !(getCode() != null ? !getCode().equals(aminoAcid.getCode()) : aminoAcid.getCode() != null); - } - - /** Hashes the three letter code. */ - public int hashCode() { - return (getCode() != null ? getCode().hashCode() : 0); - } - - /** - * Returns the full name of this AA. - */ - public String getName() { return name; } - - /** - * Returns the 1 letter code for this AA. - */ - public String getLetter() { return letter; } - - /** - * Returns the 3 letter code. - */ - public String getCode() { return threeLetterCode; } - - - /** Returns true if the amino acid is really just a stop codon. */ - public boolean isStop() { - return "*".equals(getLetter()); - } - - /** Returns true if the amino acid is really just a stop codon. */ - public boolean isUnknown() { - return "X".equals(getLetter()); - } - - public String toString() { - return name; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java deleted file mode 100755 index 1ae28ffb3..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -import java.util.HashMap; - -/** - * A simple {codon -> amino acid name} lookup table. - * Handles differences between mitochondrial and nuclear genes. - */ -public class AminoAcidTable { - - - protected static final AminoAcid UNKNOWN = new AminoAcid("X" , "Unknown", "Unk"); - protected static final AminoAcid ISOLEUCINE = new AminoAcid("I" , "Isoleucine", "Ile"); - protected static final AminoAcid LEUCINE = new AminoAcid("L" , "Leucine", "Leu"); - protected static final AminoAcid VALINE = new AminoAcid("V" , "Valine", "Val"); - protected static final AminoAcid PHENYLALANINE = new AminoAcid("F" , "Phenylalanine", "Phe"); - protected static final AminoAcid METHIONINE = new AminoAcid("M" , "Methionine", "Met"); - protected static final AminoAcid CYSTEINE = new AminoAcid("C" , "Cysteine", "Cys"); - protected static final AminoAcid ALANINE = new AminoAcid("A" , "Alanine", "Ala"); - protected static final AminoAcid STOP_CODON = new AminoAcid("*" , "Stop Codon", "Stop"); - protected static final AminoAcid GLYCINE = new AminoAcid("G" , "Glycine", "Gly"); - protected static final AminoAcid PROLINE = new AminoAcid("P" , "Proline", "Pro"); - protected static final AminoAcid THEONINE = new AminoAcid("T" , "Threonine", "Thr"); - protected static final AminoAcid SERINE = new AminoAcid("S" , "Serine", "Ser"); - protected static final AminoAcid TYROSINE = new AminoAcid("Y" , "Tyrosine", "Tyr"); - protected static final AminoAcid TRYPTOPHAN = new AminoAcid("W" , "Tryptophan", "Trp"); - protected static final AminoAcid GLUTAMINE = new AminoAcid("Q" , "Glutamine", "Gln"); - protected static final AminoAcid ASPARAGINE = new AminoAcid("N" , "Asparagine", "Asn"); - protected static final AminoAcid HISTIDINE = new AminoAcid("H" , "Histidine", "His"); - protected static final AminoAcid GLUTAMIC_ACID = new AminoAcid("E" , "Glutamic acid", "Glu"); - protected static final AminoAcid ASPARTIC_ACID = new AminoAcid("D" , "Aspartic acid", "Asp"); - protected static final AminoAcid LYSINE = new AminoAcid("K" , "Lysine", "Lys"); - protected static final AminoAcid ARGININE = new AminoAcid("R" , "Arginine", "Arg"); - - protected static HashMap aminoAcidTable = new HashMap(); - protected static HashMap mitochondrialAminoAcidTable = new HashMap(); - - static { - //populate the tables - aminoAcidTable.put("ATT", ISOLEUCINE); - aminoAcidTable.put("ATC", ISOLEUCINE); - aminoAcidTable.put("ATA", ISOLEUCINE); - - - aminoAcidTable.put("CTT", LEUCINE); - aminoAcidTable.put("CTC", LEUCINE); - aminoAcidTable.put("CTA", LEUCINE); - aminoAcidTable.put("CTG", LEUCINE); - aminoAcidTable.put("TTA", LEUCINE); - aminoAcidTable.put("TTG", LEUCINE); - - - aminoAcidTable.put("GTT", VALINE); - aminoAcidTable.put("GTC", VALINE); - aminoAcidTable.put("GTA", VALINE); - aminoAcidTable.put("GTG", VALINE); - - - aminoAcidTable.put("TTT", PHENYLALANINE); - aminoAcidTable.put("TTC", PHENYLALANINE); - - - aminoAcidTable.put("ATG", METHIONINE); - - aminoAcidTable.put("TGT", CYSTEINE); - aminoAcidTable.put("TGC", CYSTEINE); - - aminoAcidTable.put("GCT", ALANINE); - aminoAcidTable.put("GCC", ALANINE); - aminoAcidTable.put("GCA", ALANINE); - aminoAcidTable.put("GCG", ALANINE); - - - aminoAcidTable.put("GGT", GLYCINE); - aminoAcidTable.put("GGC", GLYCINE); - aminoAcidTable.put("GGA", GLYCINE); - aminoAcidTable.put("GGG", GLYCINE); - - - aminoAcidTable.put("CCT", PROLINE); - aminoAcidTable.put("CCC", PROLINE); - aminoAcidTable.put("CCA", PROLINE); - aminoAcidTable.put("CCG", PROLINE); - - - - - aminoAcidTable.put("ACT", THEONINE); - aminoAcidTable.put("ACC", THEONINE); - aminoAcidTable.put("ACA", THEONINE); - aminoAcidTable.put("ACG", THEONINE); - - - - aminoAcidTable.put("TCT", SERINE); - aminoAcidTable.put("TCC", SERINE); - aminoAcidTable.put("TCA", SERINE); - aminoAcidTable.put("TCG", SERINE); - aminoAcidTable.put("AGT", SERINE); - aminoAcidTable.put("AGC", SERINE); - - aminoAcidTable.put("TAT", TYROSINE); - aminoAcidTable.put("TAC", TYROSINE); - - - - aminoAcidTable.put("TGG", TRYPTOPHAN); - - - aminoAcidTable.put("CAA", GLUTAMINE); - aminoAcidTable.put("CAG", GLUTAMINE); - - - aminoAcidTable.put("AAT", ASPARAGINE); - aminoAcidTable.put("AAC", ASPARAGINE); - - - aminoAcidTable.put("CAT", HISTIDINE); - aminoAcidTable.put("CAC", HISTIDINE); - - - aminoAcidTable.put("GAA", GLUTAMIC_ACID); - aminoAcidTable.put("GAG", GLUTAMIC_ACID); - - - - aminoAcidTable.put("GAT", ASPARTIC_ACID); - aminoAcidTable.put("GAC", ASPARTIC_ACID); - - - aminoAcidTable.put("AAA", LYSINE); - aminoAcidTable.put("AAG", LYSINE); - - - aminoAcidTable.put("CGT", ARGININE); - aminoAcidTable.put("CGC", ARGININE); - aminoAcidTable.put("CGA", ARGININE); - aminoAcidTable.put("CGG", ARGININE); - aminoAcidTable.put("AGA", ARGININE); - aminoAcidTable.put("AGG", ARGININE); - - - aminoAcidTable.put("TAA", STOP_CODON ); - aminoAcidTable.put("TAG", STOP_CODON); - aminoAcidTable.put("TGA", STOP_CODON); - - - //populate the mitochondrial AA table - mitochondrialAminoAcidTable.putAll(aminoAcidTable); - mitochondrialAminoAcidTable.put("AGA", STOP_CODON); - mitochondrialAminoAcidTable.put("AGG", STOP_CODON); - mitochondrialAminoAcidTable.put("ATA", METHIONINE); - mitochondrialAminoAcidTable.put("TGA", TRYPTOPHAN); - } - - /** - * Returns the amino acid encoded by the given codon in a eukaryotic genome. - * - * @param codon The 3-letter mRNA nucleotide codon 5' to 3'. Expects T's instead of U's. Not case sensitive. - * - * @return The amino acid matching the given codon, or the UNKNOWN amino acid if the codon string doesn't match anything - */ - public static AminoAcid getEukaryoticAA(String codon) { - codon = codon.toUpperCase(); - final AminoAcid aa = aminoAcidTable.get(codon); - return aa == null ? UNKNOWN : aa; - } - - - /** - * Returns the amino acid encoded by the given codon in a mitochondrial genome. - * - * @param codon The 3-letter mRNA nucleotide codon 5' to 3'. Expects T's instead of U's. Not case sensitive. - * @param isFirstCodon If this is the 1st codon in the gene, then "ATT" encodes Methyonine - * - * @return The amino acid matching the given codon in mitochondrial genes, or the UNKNOWN amino acid if the codon string doesn't match anything - */ - public static AminoAcid getMitochondrialAA(String codon, boolean isFirstCodon) { - codon = codon.toUpperCase(); - final AminoAcid aa = mitochondrialAminoAcidTable.get(codon); - if(aa == null) { - return UNKNOWN; - } else if(isFirstCodon && codon.equals("ATT")) { - return METHIONINE; //special case - 'ATT' in the first codon of a mitochondrial gene codes for methionine instead of isoleucine - } else { - return aa; - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java deleted file mode 100755 index 31deb7535..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ComparableSAMRecord.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.SAMRecord; - -public class ComparableSAMRecord implements Comparable { - - private SAMRecord record; - - public ComparableSAMRecord(SAMRecord record) { - this.record = record; - } - - public SAMRecord getRecord() { - return record; - } - - public int compareTo(ComparableSAMRecord o) { - // first sort by start position -- with not coverflow because both are guaranteed to be positive. - int comparison = record.getAlignmentStart() - o.record.getAlignmentStart(); - // if the reads have the same start position, we must give a non-zero comparison - // (because java Sets often require "consistency with equals") - if ( comparison == 0 ) - comparison = record.getReadName().compareTo(o.getRecord().getReadName()); - // if the read names are the same, use the first of the pair if appropriate - if ( comparison == 0 && record.getReadPairedFlag() ) - comparison = ( record.getFirstOfPairFlag() ? -1 : 1); - return comparison; - } - - public boolean equals(Object obj) { - if ( !(obj instanceof ComparableSAMRecord) ) - return false; - if ( this == obj ) - return true; - - ComparableSAMRecord csr = (ComparableSAMRecord)obj; - if(record.getAlignmentStart() != csr.record.getAlignmentStart()) - return false; - if ( !record.getReadName().equals(csr.getRecord().getReadName()) ) - return false; - return ( record.getFirstOfPairFlag() == csr.record.getFirstOfPairFlag() ); - } -} \ No newline at end of file From 5afeb465aa0a53a80f7490914e5c8c8a5f473db1 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 20 Dec 2012 09:52:37 -0500 Subject: [PATCH 10/22] TODOs --- .../sting/gatk/traversals/TraverseActiveRegionsTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index 185eca85c..f276d21e5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -117,7 +117,8 @@ public class TraverseActiveRegionsTest extends BaseTest { // TODO: reads which are partially between intervals (in/outside extension) // TODO: duplicate reads - // TODO: should we assign reads which are completely outside intervals but within extension? + // TODO: reads which are completely outside intervals but within extension + // TODO: test the extension itself intervals = new ArrayList(); From 7748b3816f30cb5a4f6729f1cf6451634bfd241e Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 31 Dec 2012 17:22:57 -0500 Subject: [PATCH 11/22] Delete the test BAI file as well as the BAM --- .../sting/gatk/traversals/TraverseActiveRegionsTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index f276d21e5..f37c2ce63 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -104,6 +104,7 @@ public class TraverseActiveRegionsTest extends BaseTest { private List intervals; private static final String testBAM = "TraverseActiveRegionsTest.bam"; + private static final String testBAI = "TraverseActiveRegionsTest.bai"; @BeforeClass private void init() throws FileNotFoundException { @@ -149,6 +150,8 @@ public class TraverseActiveRegionsTest extends BaseTest { private void createBAM(List reads) { File outFile = new File(testBAM); outFile.deleteOnExit(); + File indexFile = new File(testBAI); + indexFile.deleteOnExit(); SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(reads.get(0).getHeader(), true, outFile); for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { From 57d38aac8a983deaca1fb79d928f0c1ab82d579b Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Tue, 1 Jan 2013 18:36:56 -0500 Subject: [PATCH 12/22] Temporarily disable due to unknown contracts problem --- .../sting/gatk/traversals/TraverseActiveRegionsTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index f37c2ce63..1c79b2aa9 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -179,13 +179,15 @@ public class TraverseActiveRegionsTest extends BaseTest { return activeIntervals; } - @Test (expectedExceptions = PreconditionError.class) + // TODO: fix this contracts issue and re-enable + @Test (enabled = false, expectedExceptions = PreconditionError.class) public void testIsActiveRangeLow () { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); getActiveRegions(walker, intervals).values(); } - @Test (expectedExceptions = PreconditionError.class) + // TODO: fix this contracts issue and re-enable + @Test (enabled = false, expectedExceptions = PreconditionError.class) public void testIsActiveRangeHigh () { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); getActiveRegions(walker, intervals).values(); From 429567cd3fae68f7a3f9d1c7cc78d7987a5ee7e8 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Tue, 1 Jan 2013 19:20:30 -0500 Subject: [PATCH 13/22] Rename to TraverseActiveRegionsUnitTest --- ...eRegionsTest.java => TraverseActiveRegionsUnitTest.java} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename public/java/test/org/broadinstitute/sting/gatk/traversals/{TraverseActiveRegionsTest.java => TraverseActiveRegionsUnitTest.java} (99%) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 1c79b2aa9..0e47c14c5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -45,7 +45,7 @@ import java.util.*; * Test the Active Region Traversal Contract * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ -public class TraverseActiveRegionsTest extends BaseTest { +public class TraverseActiveRegionsUnitTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; @@ -103,8 +103,8 @@ public class TraverseActiveRegionsTest extends BaseTest { private List intervals; - private static final String testBAM = "TraverseActiveRegionsTest.bam"; - private static final String testBAI = "TraverseActiveRegionsTest.bai"; + private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; + private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; @BeforeClass private void init() throws FileNotFoundException { From 3d1c107f9d637bef3f19e844dd337b4e45ff3c26 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 31 Dec 2012 09:24:01 -0500 Subject: [PATCH 14/22] Detect if clover is present in build.xml. Automatically clean clover db in ant clean, if present --- build.xml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/build.xml b/build.xml index b4bb8716d..c4c31c892 100644 --- a/build.xml +++ b/build.xml @@ -114,7 +114,6 @@ - @@ -1031,6 +1030,14 @@ + + + + + + + + @@ -1038,7 +1045,7 @@ - + @@ -1135,10 +1142,6 @@ - - - - From c1c5737f80b1e5dd8fc7edee77d2e63272454573 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Jan 2013 11:28:54 -0500 Subject: [PATCH 15/22] Re-enabling contracts in tests in build.xml -- Left contracts turned off in original clover commit --- build.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.xml b/build.xml index c4c31c892..3aead62d6 100644 --- a/build.xml +++ b/build.xml @@ -1278,7 +1278,7 @@ - + From 12f4c6307ef3a06eb290dd467d3fe27323ebf82f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Jan 2013 08:46:11 -0500 Subject: [PATCH 16/22] AutoFormattingTime cleanup and complete unittests -- Underlying system now uses long nano times to be more consistent with standard java practice -- Updated a few places in the code that were converting from nanoseconds to double seconds to use the new nanoseconds interface directly -- Bringing us to 100% test coverage with clover with AutoFormattingTimeUnitTest --- .../sting/gatk/executive/MicroScheduler.java | 2 +- .../genotyper/afcalc/ExactCallLogger.java | 2 +- .../sting/utils/AutoFormattingTime.java | 118 +++++++++++++++--- .../threading/ThreadEfficiencyMonitor.java | 2 +- .../sting/AutoFormattingTimeUnitTest.java | 117 +++++++++++++++++ .../org/broadinstitute/sting/BaseTest.java | 10 +- 6 files changed, 229 insertions(+), 22 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 8d0cefaa4..f8aec1489 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -284,7 +284,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { protected boolean abortExecution() { final boolean abort = engine.exceedsRuntimeLimit(progressMeter.getRuntimeInNanoseconds(), TimeUnit.NANOSECONDS); if ( abort ) { - final AutoFormattingTime aft = new AutoFormattingTime(TimeUnit.SECONDS.convert(engine.getRuntimeLimitInNanoseconds(), TimeUnit.NANOSECONDS), 1, 4); + final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); } return abort; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java index c9270a6a7..3681451c6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -50,7 +50,7 @@ public class ExactCallLogger implements Cloneable { return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s", vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(), originalCall.getLog10PosteriorOfAFGT0(), - new AutoFormattingTime(runtime / 1e9).toString()); + new AutoFormattingTime(runtime).toString()); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java b/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java index 4455666e8..fdcb8ba03 100644 --- a/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java +++ b/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java @@ -1,32 +1,104 @@ package org.broadinstitute.sting.utils; +import java.util.concurrent.TimeUnit; + /** - * Simple utility class that makes it convenient to print unit adjusted times + * Conveniently print a time with an automatically determined time unit + * + * For example, if the amount of time is 10^6 seconds, instead of printing + * out 10^6 seconds, prints out 11.57 days instead. + * + * Dynamically uses time units: + * + * - seconds: s + * - minutes: m + * - hours : h + * - days : d + * - weeks : w + * + * @author depristo + * @since 2009 */ public class AutoFormattingTime { + private static final double NANOSECONDS_PER_SECOND = 1e9; + + /** + * Width a la format's %WIDTH.PERCISIONf + */ private final int width; // for format + + /** + * Precision a la format's %WIDTH.PERCISIONf + */ private final int precision; // for format - double timeInSeconds; // in Seconds - private final String formatString; + /** + * The elapsed time in nanoseconds + */ + private final long nanoTime; + + /** + * Create a new autoformatting time with elapsed time nanoTime in nanoseconds + * @param nanoTime the elapsed time in nanoseconds + * @param width the width >= 0 (a la format's %WIDTH.PERCISIONf) to use to display the format, or -1 if none is required + * @param precision the precision to display the time at. Must be >= 0; + */ + public AutoFormattingTime(final long nanoTime, final int width, int precision) { + if ( width < -1 ) throw new IllegalArgumentException("Width " + width + " must be >= -1"); + if ( precision < 0 ) throw new IllegalArgumentException("Precision " + precision + " must be >= 0"); - public AutoFormattingTime(double timeInSeconds, final int width, int precision) { this.width = width; - this.timeInSeconds = timeInSeconds; + this.nanoTime = nanoTime; this.precision = precision; - this.formatString = "%" + width + "." + precision + "f %s"; } - public AutoFormattingTime(double timeInSeconds, int precision) { - this(timeInSeconds, 6, precision); + /** + * @see #AutoFormattingTime(long, int, int) but with default width and precision + * @param nanoTime + */ + public AutoFormattingTime(final long nanoTime) { + this(nanoTime, 6, 1); } + /** + * @see #AutoFormattingTime(long, int, int) but with time specificied as a double in seconds + */ + public AutoFormattingTime(final double timeInSeconds, final int width, final int precision) { + this(secondsToNano(timeInSeconds), width, precision); + } + + /** + * @see #AutoFormattingTime(long) but with time specificied as a double in seconds + */ public AutoFormattingTime(double timeInSeconds) { - this(timeInSeconds, 1); + this(timeInSeconds, 6, 1); } + /** + * Precomputed format string suitable for string.format with the required width and precision + */ + private String getFormatString() { + final StringBuilder b = new StringBuilder("%"); + if ( width != -1 ) + b.append(width); + b.append(".").append(precision).append("f %s"); + return b.toString(); + } + + /** + * Get the time associated with this object in nanoseconds + * @return the time in nanoseconds + */ + public long getTimeInNanoSeconds() { + return nanoTime; + } + + /** + * Get the time associated with this object in seconds, as a double + * @return time in seconds as a double + */ public double getTimeInSeconds() { - return timeInSeconds; + return TimeUnit.NANOSECONDS.toSeconds(getTimeInNanoSeconds()); } /** @@ -44,15 +116,16 @@ public class AutoFormattingTime { } /** - * Instead of 10000 s, returns 2.8 hours - * @return + * Get a string representation of this time, automatically converting the time + * to a human readable unit with width and precision provided during construction + * @return a non-null string */ public String toString() { - double unitTime = timeInSeconds; + double unitTime = getTimeInSeconds(); String unit = "s"; - if ( timeInSeconds > 120 ) { - unitTime = timeInSeconds / 60; // minutes + if ( unitTime > 120 ) { + unitTime /= 60; // minutes unit = "m"; if ( unitTime > 120 ) { @@ -64,13 +137,24 @@ public class AutoFormattingTime { unit = "d"; if ( unitTime > 20 ) { - unitTime /= 7; // days + unitTime /= 7; // weeks unit = "w"; } } } } - return String.format(formatString, unitTime, unit); + return String.format(getFormatString(), unitTime, unit); } + + + /** + * Convert a time in seconds as a double into nanoseconds as a long + * @param timeInSeconds an elapsed time in seconds, as a double + * @return an equivalent value in nanoseconds as a long + */ + private static long secondsToNano(final double timeInSeconds) { + return (long)(NANOSECONDS_PER_SECOND * timeInSeconds); + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java index 9159f5657..68ccd8e58 100644 --- a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java +++ b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java @@ -133,7 +133,7 @@ public class ThreadEfficiencyMonitor { */ public synchronized void printUsageInformation(final Logger logger, final Priority priority) { logger.debug("Number of threads monitored: " + getnThreadsAnalyzed()); - logger.debug("Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toSeconds(getTotalTime()))); + logger.debug("Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toNanos(getTotalTime()))); for ( final State state : State.values() ) { logger.debug(String.format("\tPercent of time spent %s is %.2f", state.getUserFriendlyName(), getStatePercent(state))); } diff --git a/public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java b/public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java new file mode 100644 index 000000000..4292a5ef4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/AutoFormattingTimeUnitTest.java @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting; + +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * UnitTests for the AutoFormatting + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class AutoFormattingTimeUnitTest extends BaseTest { + @DataProvider(name = "AutoFormattingTimeUnitSelection") + public Object[][] makeTimeData() { + List tests = new ArrayList(); + tests.add(new Object[]{TimeUnit.SECONDS.toNanos(10), "s"}); + tests.add(new Object[]{TimeUnit.MINUTES.toNanos(10), "m"}); + tests.add(new Object[]{TimeUnit.HOURS.toNanos(10), "h"}); + tests.add(new Object[]{TimeUnit.DAYS.toNanos(10), "d"}); + tests.add(new Object[]{TimeUnit.DAYS.toNanos(1000), "w"}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AutoFormattingTimeUnitSelection") + public void testUnitSelection(final long nano, final String expectedUnit) throws InterruptedException { + final AutoFormattingTime time = new AutoFormattingTime(nano); + testBasic(time, nano, time.getWidth(), time.getPrecision()); + Assert.assertTrue(time.toString().endsWith(expectedUnit), "TimeUnit " + time.toString() + " didn't contain expected time unit " + expectedUnit); + } + + @Test(dataProvider = "AutoFormattingTimeUnitSelection") + public void testSecondsAsDouble(final long nano, final String expectedUnit) throws InterruptedException { + final double inSeconds = nano * 1e-9; + final long nanoFromSeconds = (long)(inSeconds * 1e9); + final AutoFormattingTime time = new AutoFormattingTime(inSeconds); + testBasic(time, nanoFromSeconds, time.getWidth(), time.getPrecision()); + } + + @DataProvider(name = "AutoFormattingTimeWidthAndPrecision") + public Object[][] makeTimeWidthAndPrecision() { + List tests = new ArrayList(); + for ( final int width : Arrays.asList(-1, 1, 2, 6, 20) ) { + for ( final int precision : Arrays.asList(1, 2) ) { + tests.add(new Object[]{100.123456 * 1e9, width, precision}); + tests.add(new Object[]{0.123456 * 1e9, width, precision}); + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AutoFormattingTimeWidthAndPrecision") + public void testWidthAndPrecision(final double inSeconds, final int width, final int precision) throws InterruptedException { + final AutoFormattingTime time = new AutoFormattingTime(inSeconds, width, precision); + final long nanoFromSeconds = (long)(inSeconds * 1e9); + testBasic(time, nanoFromSeconds, width, precision); + final Matcher match = matchToString(time); + match.matches(); + final String widthString = match.group(1); + final String precisionString = match.group(2); + if ( width != -1 ) { + final int actualWidth = widthString.length() + 1 + precisionString.length(); + Assert.assertTrue(actualWidth >= width, "width string '" + widthString + "' not >= the expected width " + width); + } + Assert.assertEquals(precisionString.length(), precision, "precision string '" + precisionString + "' not the expected precision " + precision); + } + + private static Matcher matchToString(final AutoFormattingTime time) { + Pattern pattern = Pattern.compile("(\\s*\\d*)\\.(\\d*) \\w"); + return pattern.matcher(time.toString()); + } + + private static void testBasic(final AutoFormattingTime aft, final long nano, final int expectedWidth, final int expectedPrecision) { + Assert.assertEquals(aft.getTimeInNanoSeconds(), nano); + assertEqualsDoubleSmart(aft.getTimeInSeconds(), nano * 1e-9, 1e-3, "Time in seconds not within tolerance of nanoSeconds"); + Assert.assertEquals(aft.getWidth(), expectedWidth); + Assert.assertEquals(aft.getPrecision(), expectedPrecision); + Assert.assertNotNull(aft.toString(), "TimeUnit toString returned null"); + final Matcher match = matchToString(aft); + Assert.assertTrue(match.matches(), "toString " + aft.toString() + " doesn't match our expected format"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 76e25a3c0..a1f84b8c2 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -301,7 +301,11 @@ public abstract class BaseTest { Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps } - public static final void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { + assertEqualsDoubleSmart(actual, expected, tolerance, null); + } + + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately Assert.assertTrue(Double.isNaN(actual), "expected is nan, actual is not"); else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately @@ -309,7 +313,9 @@ public abstract class BaseTest { else { final double delta = Math.abs(actual - expected); final double ratio = Math.abs(actual / expected - 1.0); - Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual + " not within tolerance " + tolerance); + Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual + + " not within tolerance " + tolerance + + (message == null ? "" : "message: " + message)); } } } From a15f368bdc623fe5141e067852f8e746484d1e34 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 2 Jan 2013 10:10:09 -0500 Subject: [PATCH 17/22] Re-enable testIsActiveRangeLow/High --- .../gatk/traversals/TraverseActiveRegionsUnitTest.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 0e47c14c5..e82dc28e1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -179,15 +179,13 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return activeIntervals; } - // TODO: fix this contracts issue and re-enable - @Test (enabled = false, expectedExceptions = PreconditionError.class) + @Test (expectedExceptions = PreconditionError.class) public void testIsActiveRangeLow () { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); getActiveRegions(walker, intervals).values(); } - // TODO: fix this contracts issue and re-enable - @Test (enabled = false, expectedExceptions = PreconditionError.class) + @Test (expectedExceptions = PreconditionError.class) public void testIsActiveRangeHigh () { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); getActiveRegions(walker, intervals).values(); From e1d09ab0db58ab302a31d082b74a968739ff14ea Mon Sep 17 00:00:00 2001 From: Chris Hartl Date: Wed, 2 Jan 2013 14:41:29 -0500 Subject: [PATCH 18/22] QD is now divided by the average length of the alternate allele (weighted by the allele count). The average length is stored in a related annotation, "AAL", which can be used to re-compute the "old" QD by simple multiplication. Integration tests *should* all pass. --- ...GenotyperGeneralPloidyIntegrationTest.java | 14 ++-- .../UnifiedGenotyperIntegrationTest.java | 76 +++++++++---------- .../HaplotypeCallerIntegrationTest.java | 16 ++-- .../gatk/walkers/annotator/QualByDepth.java | 40 +++++++++- .../variantutils/VariantsToBinaryPed.java | 6 +- .../VariantAnnotatorIntegrationTest.java | 26 +++---- 6 files changed, 108 insertions(+), 70 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index cdd31a5ef..5cdc15e5e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -55,36 +55,36 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSNP_ACS_Pools() { - PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","df0e67c975ef74d593f1c704daab1705"); + PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","651469eeacdb3ab9e2690cfb71f6a634"); } @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","7e5b28c9e21cc7e45c58c41177d8a0fc"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","be7dc20bdb5f200d189706bcf1aeb7ee"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","ae6c276cc46785a794acff6f7d10ecf7"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","25e5ea86d87b7d7ddaad834a6ed7481d"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","6987b89e04dcb604d3743bb09aa9587d"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","cdbf268d282e57189a88fb83f0e1fd72"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","d0780f70365ed1b431099fd3b4cec449"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","2ed40925cd112c1a45470d215b7ec4b3"); } @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","33695a998bcc906cabcc758727004387"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "b2725242114bf9cc9bca14679705ba40"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a8ba92634..e40a7ed38 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("847605f4efafef89529fe0e496315edd")); + Arrays.asList("2ba9af34d2a4d55caf152265a30ead46")); executeTest("test MultiSample Pilot1", spec); } @@ -38,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("5b31b811072a4df04524e13604015f9b")); + Arrays.asList("0630c35c070d7a7e0cf22b3cce797f22")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -46,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d9992e55381afb43742cc9b30fcd7538")); + Arrays.asList("5857dcb4e6a8422ae0813e42d433b122")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("fea530fdc8677e10be4cc11625fa5376")); + Arrays.asList("489deda5d3276545364a06b7385f8bd9")); executeTest("test SingleSample Pilot2", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("b41b95aaa2c453c9b75b3b29a9c2718e")); + Arrays.asList("595ba44c75d08dab98df222b8e61ab70")); executeTest("test Multiple SNP alleles", spec); } @@ -70,7 +70,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testBadRead() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, - Arrays.asList("d915535c1458733f09f82670092fcab6")); + Arrays.asList("360f9795facdaa14c0cb4b05207142e4")); executeTest("test bad read", spec); } @@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("e14c9b1f9f34d6c16de445bfa385be89")); + Arrays.asList("4b4a62429f8eac1e2f27ba5e2edea9e5")); executeTest("test reverse trim", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8")); + Arrays.asList("cc892c91a93dbd8dbdf645803f35a0ee")); executeTest("test mismatched PLs", spec); } @@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "af8187e2baf516dde1cddea787a52b8a"; + private final static String COMPRESSED_OUTPUT_MD5 = "3fc7d2681ff753e2d68605d7cf8b63e3"; @Test public void testCompressedOutput() { @@ -149,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff")); + Arrays.asList("04dc83d7dfb42b8cada91647bd9f32f1")); executeTest("test min_base_quality_score 26", spec); } @@ -157,7 +157,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("55760482335497086458b09e415ecf54")); + Arrays.asList("4429a665a1048f958db3c204297cdb9f")); executeTest("test SLOD", spec); } @@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("938e888a40182878be4c3cc4859adb69")); + Arrays.asList("f063e3573c513eaa9ce7d7df22143362")); executeTest("test NDA", spec); } @@ -173,7 +173,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("7dc186d420487e4e156a24ec8dea0951")); + Arrays.asList("d76e93e2676354dde832f08a508c6f88")); executeTest("test using comp track", spec); } @@ -187,17 +187,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9"); + testOutputParameters("-sites_only", "1a65172b9bd7a2023d48bc758747b34a"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "3f1fa34d8440f6f21654ce60c0ba8f28"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "f240434b4d3c234f6f9e349e9ec05f4e"); } private void testOutputParameters(final String args, final String md5) { @@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb")); + Arrays.asList("aec378bed312b3557c6dd7ec740c8091")); executeTest("test confidence 1", spec1); } @@ -222,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "8dd37249e0a80afa86594c3f1e720760" ); + testHeterozosity( 0.01, "5da6b24033a6b02f466836443d49560e" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "040d169e20fda56f8de009a6015eb384" ); + testHeterozosity( 1.0 / 1850, "1f284c4af967a3c26687164f9441fb16" ); } private void testHeterozosity(final double arg, final String md5) { @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("0e4713e4aa44f4f8fcfea7138295a627")); + Arrays.asList("cff553c53de970f64051ed5711407038")); executeTest(String.format("test multiple technologies"), spec); } @@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("46ea5d1ceb8eed1d0db63c3577915d6c")); + Arrays.asList("f960a91963e614a6c8d8cda57836df24")); executeTest(String.format("test calling with BAQ"), spec); } @@ -289,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("f6f8fbf733f20fbc1dd9ebaf8faefe6c")); + Arrays.asList("46a6d24c82ebb99d305462960fa09b7c")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -304,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("4438ad0f03bbdd182d9bb59b15af0fa5")); + Arrays.asList("2be25321bbc6a963dba7ecba5dd76802")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("27b4ace2ad5a83d8cccb040f97f29183")); + Arrays.asList("d6b2657cd5a4a949968cdab50efce515")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("b8129bf754490cc3c76191d8cc4ec93f")); + Arrays.asList("9cff66a321284c362f393bc4db21f756")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -337,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("591332fa0b5b22778cf820ee257049d2")); + Arrays.asList("90c8cfcf65152534c16ed81104fc3bcd")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("d3d518448b01bf0f751824b3d946cd04")); + Arrays.asList("457b8f899cf1665de61e75084dbb79d0")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("2ea18a3e8480718a80a415d3fea79f54")); + Arrays.asList("a13fe7aa3b9e8e091b3cf3442a056ec1")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -361,7 +361,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); + Arrays.asList("d075ad318739c8c56bdce857da1e48b9")); executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); } @@ -373,7 +373,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56")); + Arrays.asList("91c632ab17a1dd89ed19ebb20324f905")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -407,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("90adefd39ed67865b0cb275ad0f07383")); + Arrays.asList("1d80e135d611fe19e1fb1882aa588a73")); executeTest("test minIndelFraction 0.0", spec); } @@ -415,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("2fded43949e258f8e9f68893c61c1bdd")); + Arrays.asList("752139616752902fca13c312d8fe5e22")); executeTest("test minIndelFraction 0.25", spec); } @@ -423,7 +423,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction100() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 1", 1, - Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); + Arrays.asList("d66b9decf26e1704abda1a919ac149cd")); executeTest("test minIndelFraction 1.0", spec); } @@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); + Arrays.asList("b62ba9777efc05af4c36e2d4ce3ee67c")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,18 +451,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("092e42a712afb660ec79ff11c55933e2")); + Arrays.asList("f72ecd00b2913f63788faa7dabb1d102")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); + testReducedCalling("SNP", "f059743858004ceee325f2a7761a2362"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "9d5418ddf1b227ae4d463995507f2b1c"); + testReducedCalling("INDEL", "04845ba1ec7d8d8b0eab2ca6bdb9c1a6"); } @@ -483,7 +483,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testContaminationDownsampling() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1, - Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb")); + Arrays.asList("b500ad5959bce69f888a2fac024647e5")); executeTest("test contamination_percentage_to_filter 0.20", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8422d856e..bd1352165 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,12 +21,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "839de31b41d4186e2b12a5601525e894"); + HCTest(CEUTRIO_BAM, "", "7122d4f0ef94c5274aa3047cfebe08ed"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "2b68faa0e0493d92491d74b8f731963a"); + HCTest(NA12878_BAM, "", "6cd6e6787521c07a7bae98766fd628ab"); } // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @@ -44,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd8d2ae8db9d98e932b0a7f345631eec"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "4a413eeb7a75cab0ab5370b4c08dcf8e"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -55,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "0761ff5cbf279be467833fa6708bf360"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "77cf5b5273828dd1605bb23a5aeafcaa"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -66,20 +66,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6380e25c1ec79c6ae2f891ced15bf4e1"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "87ca97f90e74caee35c35616c065821c"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3a096d6139d15dcab82f5b091d08489d")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3df42d0550b51eb9b55aac61e8b3c452")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a518c7436544f2b5f71c9d9427ce1cce")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4dbc72b72e3e2d9d812d5a398490e213")); executeTest("HCTestStructuralIndels: ", spec); } @@ -93,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); + Arrays.asList("f8c2745bf71f2659a57494fcaa2c103b")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index af27d9c6f..24bac9deb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -13,6 +13,7 @@ import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; @@ -68,16 +69,49 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati return null; double QD = -10.0 * vc.getLog10PError() / (double)depth; - Map map = new HashMap(); + + if ( ! vc.isSNP() && ! vc.isSymbolic() ) { + // adjust for the event length + int averageLengthNum = 0; + int averageLengthDenom = 0; + int refLength = vc.getReference().length(); + for ( Allele a : vc.getAlternateAlleles() ) { + int numAllele = vc.getCalledChrCount(a); + int alleleSize; + if ( a.length() == refLength ) { + // SNP or MNP + byte[] a_bases = a.getBases(); + byte[] ref_bases = vc.getReference().getBases(); + int n_mismatch = 0; + for ( int idx = 0; idx < a_bases.length; idx++ ) { + if ( a_bases[idx] != ref_bases[idx] ) + n_mismatch++; + } + alleleSize = n_mismatch; + } + else if ( a.isSymbolic() ) { + alleleSize = 1; + } else { + alleleSize = Math.abs(refLength-a.length()); + } + averageLengthNum += alleleSize*numAllele; + averageLengthDenom += numAllele; + } + double averageLength = ( (double) averageLengthNum )/averageLengthDenom; + QD /= averageLength; + map.put(getKeyNames().get(1),String.format("%.2f",averageLength)); + } + map.put(getKeyNames().get(0), String.format("%.2f", QD)); return map; } - public List getKeyNames() { return Arrays.asList("QD"); } + public List getKeyNames() { return Arrays.asList("QD","AAL"); } public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); + return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth"), + new VCFInfoHeaderLine(getKeyNames().get(1), 1, VCFHeaderLineType.Float, "Average Allele Length")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 98fe6636c..295e6f3cd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -80,6 +80,9 @@ public class VariantsToBinaryPed extends RodWalker { @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") boolean majorAlleleFirst = false; + @Argument(fullName="checkAlternateAlleles",required=false,doc="Checks that alternate alleles actually appear in samples, erroring out if they do not") + boolean checkAlternateAlleles = false; + enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR } private static double APPROX_CM_PER_BP = 1000000.0/750000.0; @@ -473,7 +476,8 @@ public class VariantsToBinaryPed extends RodWalker { System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength); final Allele observedRefAllele = Allele.create(observedRefBases); vc.validateReferenceBases(reportedRefAllele, observedRefAllele); - vc.validateAlternateAlleles(); + if ( checkAlternateAlleles ) + vc.validateAlternateAlleles(); } private String getReferenceAllele(VariantContext vc) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index b097e3d34..90e1d5c34 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fbfbd4d13b7ba3d76e8e186902e81378")); + Arrays.asList("a127623a26bac4c17c9df491e170ed88")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("19aef8914efc497192f89a9038310ca5")); + Arrays.asList("13e24e6b9dfa241df5baa2c3f53415b9")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("4f0b8033da18e6cf6e9b8d5d36c21ba2")); + Arrays.asList("07cb4d427235878aeec0066d7d298e54")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("64ca176d587dfa2b3b9dec9f7999305c")); + Arrays.asList("e579097677d5e56a5776151251947961")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("f33f417fad98c05d9cd08ffa22943b0f")); + Arrays.asList("348314945436ace71ce6b1a52559d9ee")); executeTest("test exclude annotations", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("0c810f6c4abef9d9dc5513ca872d3d22")); + Arrays.asList("ae7930e37a66c0aa4cfe0232736864fe")); executeTest("test overwriting header", spec); } @@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("1c423b7730b9805e7b885ece924286e0")); + Arrays.asList("a0ba056c2625033e5e859fd6bcec1256")); executeTest("not passing it any reads", spec); } @@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); + Arrays.asList("0be7da17340111a94e8581ee3808c88a")); executeTest("getting DB tag with dbSNP", spec); } @@ -114,7 +114,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testMultipleIdsWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, - Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); + Arrays.asList("e40e625302a496ede42eed61c2ce524b")); executeTest("adding multiple IDs with dbSNP", spec); } @@ -122,7 +122,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("cc7184263975595a6e2473d153227146")); + Arrays.asList("cb50876477d3e035b6eda5d720d7ba8d")); executeTest("getting DB tag with HM3", spec); } @@ -130,7 +130,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoQuals() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, - Arrays.asList("aea983adc01cd059193538cc30adc17d")); + Arrays.asList("458412261d61797d39f802c1e03d63f6")); executeTest("test file doesn't have QUALs", spec); } @@ -138,7 +138,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); + Arrays.asList("39defa8108dca9fa3e54b22a7da43f77")); executeTest("using expression", spec); } @@ -146,7 +146,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("3de1d1998203518098ffae233f3e2352")); + Arrays.asList("a917edd58a0c235e9395bfc2d2020a8c")); executeTest("using expression with ID", spec); } From dcb7735d3c8cf8254f0bd80b42f827c654961ee0 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 2 Jan 2013 13:46:33 -0500 Subject: [PATCH 19/22] Active Region extensions must stay on contig --- .../traversals/TraverseActiveRegionsUnitTest.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index e82dc28e1..4cda1455e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -241,6 +241,21 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); } + @Test + public void testActiveRegionExtensionOnContig() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + Collection activeRegions = getActiveRegions(walker, intervals).values(); + for (ActiveRegion activeRegion : activeRegions) { + GenomeLoc loc = activeRegion.getExtendedLoc(); + + // Contract: active region extensions must stay on the contig + Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); + int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); + Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); + } + } + @Test public void testPrimaryReadMapping() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); From c515175313e802bdc7b9e87df6d1498897c2ee1a Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 2 Jan 2013 14:10:55 -0500 Subject: [PATCH 20/22] Ensure that active region extensions stay on contig --- .../src/org/broadinstitute/sting/utils/GenomeLocParser.java | 5 +++++ .../sting/utils/activeregion/ActiveRegion.java | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index dbffacfbc..c18eef23f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -287,6 +287,11 @@ public final class GenomeLocParser { return new GenomeLoc(contig, index, start, stop); } + public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) { + GenomeLoc contigLoc = createOverEntireContig(contig); + return new GenomeLoc(contig, getContigIndex(contig), start, stop).intersect(contigLoc); + } + /** * validate a position or interval on the genome as valid * diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 0d12d53cc..c12dfcee9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -31,7 +31,7 @@ public class ActiveRegion implements HasGenomeLocation { this.isActive = isActive; this.genomeLocParser = genomeLocParser; this.extension = extension; - extendedLoc = genomeLocParser.createGenomeLoc(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); fullExtentReferenceLoc = extendedLoc; } From 3753209584c42b80e7cf2432e9141b6028e3dd8d Mon Sep 17 00:00:00 2001 From: Chris Hartl Date: Wed, 2 Jan 2013 15:09:28 -0500 Subject: [PATCH 21/22] One md5sum slipped past in the HC integration test. --- .../walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index bd1352165..bb9efe15d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -33,7 +33,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "a2d56179cd19a41f8bfb995e225320bb"); + "44df2a9da4fbd2162ae44c3f2a6ef01f"); } private void HCTestComplexVariants(String bam, String args, String md5) { From c7039a9b713703d0252ea073ea482e5416413442 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 2 Jan 2013 15:21:44 -0500 Subject: [PATCH 22/22] Pushing in implementation of the Bayesian estimate of Qemp for the BQSR. This isn't hooked up yet with BQSR; it's just a static method used in my testing walker. I'll hook this into BQSR after more testing and the addition of unit tests. Most of the changes in this commit are actually documentation-related. --- .../sting/utils/QualityUtils.java | 5 +- .../recalibration/BaseRecalibration.java | 8 +-- .../utils/recalibration/QualQuantizer.java | 15 ++-- .../sting/utils/recalibration/RecalDatum.java | 71 ++++++++++++++++++- .../recalibration/RecalibrationReport.java | 5 +- 5 files changed, 82 insertions(+), 22 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 861f172d9..844fa6b90 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -14,7 +14,8 @@ public class QualityUtils { public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); public final static double MIN_REASONABLE_ERROR = 0.0001; - public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious + public final static byte MAX_REASONABLE_Q_SCORE = 60; // bams containing quals above this value are extremely suspicious and we should warn the user + public final static byte MAX_GATK_USABLE_Q_SCORE = 40; // quals above this value should be capped down to this value (because they are too high) public final static byte MIN_USABLE_Q_SCORE = 6; public final static int MAPPING_QUALITY_UNAVAILABLE = 255; @@ -73,7 +74,7 @@ public class QualityUtils { } public static double qualToErrorProb(final double qual) { - return Math.pow(10.0, ((double) qual)/-10.0); + return Math.pow(10.0, qual/-10.0); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 567514f8c..3e0f36799 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -76,7 +76,7 @@ public class BaseRecalibration { quantizationInfo = recalibrationReport.getQuantizationInfo(); if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores quantizationInfo.noQuantization(); - else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. quantizationInfo.quantizeQualityScores(quantizationLevels); this.disableIndelQuals = disableIndelQuals; @@ -233,9 +233,9 @@ public class BaseRecalibration { * Note that this calculation is a constant for each rgKey and errorModel. We need only * compute this value once for all data. * - * @param rgKey - * @param errorModel - * @return + * @param rgKey read group key + * @param errorModel the event type + * @return global delta Q */ private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) { double result = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java index a5a3104a0..b5370e268 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java @@ -175,8 +175,7 @@ public class QualQuantizer { } /** - * Human readable name of this interval: e.g., 10-12 - * @return + * @return Human readable name of this interval: e.g., 10-12 */ public String getName() { return qStart + "-" + qEnd; @@ -188,8 +187,7 @@ public class QualQuantizer { } /** - * Returns the error rate (in real space) of this interval, or 0 if there are no obserations - * @return + * @return the error rate (in real space) of this interval, or 0 if there are no observations */ @Ensures("result >= 0.0") public double getErrorRate() { @@ -202,9 +200,7 @@ public class QualQuantizer { } /** - * Returns the QUAL of the error rate of this interval, or the fixed - * qual if this interval was created with a fixed qual. - * @return + * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. */ @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE") public byte getQual() { @@ -254,7 +250,7 @@ public class QualQuantizer { final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this; if ( left.qEnd + 1 != right.qStart ) - throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right); + throw new ReviewedStingException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); final long nCombinedObs = left.nObservations + right.nObservations; final long nCombinedErr = left.nErrors + right.nErrors; @@ -343,8 +339,7 @@ public class QualQuantizer { } /** - * Helper function that finds and mergest together the lowest penalty pair - * of intervals + * Helper function that finds and merges together the lowest penalty pair of intervals * @param intervals */ @Requires("! intervals.isEmpty()") diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 1ab3b10c4..9b58a5900 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -28,9 +28,10 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; import com.google.java.contract.Requires; +import org.apache.commons.math.optimization.fitting.GaussianFunction; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import java.util.Random; /** * An individual piece of recalibration data. Each bin counts up the number of observations and the number @@ -149,7 +150,7 @@ public class RecalDatum { //--------------------------------------------------------------------------------------------------------------- /** - * Returns the error rate (in real space) of this interval, or 0 if there are no obserations + * Returns the error rate (in real space) of this interval, or 0 if there are no observations * @return the empirical error rate ~= N errors / N obs */ @Ensures("result >= 0.0") @@ -262,6 +263,9 @@ public class RecalDatum { @Requires("empiricalQuality == UNINITIALIZED") @Ensures("empiricalQuality != UNINITIALIZED") private synchronized void calcEmpiricalQuality() { + + // TODO -- add code for Bayesian estimate of Qemp here + final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } @@ -276,4 +280,65 @@ public class RecalDatum { private double calcExpectedErrors() { return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); } -} + + static final boolean DEBUG = false; + static final double RESOLUTION_BINS_PER_QUAL = 1.0; + + static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) { + + final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; + + final double[] log10Posteriors = new double[numBins]; + + for ( int bin = 0; bin < numBins; bin++ ) { + + final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; + + log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors); + + if ( DEBUG ) + System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); + } + + if ( DEBUG ) + System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); + + final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); + final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); + + final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL; + return Qemp; + } + + static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1]; + static { + // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) + // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". + final double GF_a = 0.0; + final double GF_b = 0.9; + final double GF_c = 0.0; + final double GF_d = 0.5; + + final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); + for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) + log10QempPriorCache[i] = Math.log10(gaussian.value((double) i)); + } + + static public double log10QempPrior(final double Qempirical, final double Qreported) { + final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE); + if ( DEBUG ) + System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); + return log10QempPriorCache[difference]; + } + + static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) { + // this is just a straight binomial PDF + double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); + if ( log10Prob == Double.NEGATIVE_INFINITY ) + log10Prob = -Double.MAX_VALUE; + + if ( DEBUG ) + System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + return log10Prob; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 081221da4..6ecac1394 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -9,8 +9,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import java.io.File; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** @@ -199,7 +198,7 @@ public class RecalibrationReport { private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); - final double empiricalQuality = (Double) reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME); + final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); // the estimatedQreported column only exists in the ReadGroup table final double estimatedQReported = hasEstimatedQReportedColumn ?