diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java
deleted file mode 100644
index 68be1629c..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java
+++ /dev/null
@@ -1,2291 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.indels;
-
-import net.sf.samtools.*;
-import org.apache.commons.jexl2.Expression;
-import org.apache.commons.jexl2.JexlContext;
-import org.apache.commons.jexl2.JexlEngine;
-import org.apache.commons.jexl2.MapContext;
-import org.broad.tribble.Feature;
-import org.broadinstitute.sting.commandline.*;
-import org.broadinstitute.sting.gatk.CommandLineGATK;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
-import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
-import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
-import org.broadinstitute.sting.gatk.filters.Platform454Filter;
-import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
-import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
-import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
-import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
-import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
-import org.broadinstitute.sting.gatk.walkers.ReadFilters;
-import org.broadinstitute.sting.gatk.walkers.ReadWalker;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-import org.broadinstitute.sting.utils.SampleUtils;
-import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec;
-import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature;
-import org.broadinstitute.sting.utils.codecs.refseq.Transcript;
-import org.broadinstitute.variant.vcf.*;
-import org.broadinstitute.sting.utils.collections.CircularArray;
-import org.broadinstitute.sting.utils.collections.PrimitivePair;
-import org.broadinstitute.sting.utils.exceptions.StingException;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
-import org.broadinstitute.sting.utils.interval.IntervalUtils;
-import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator;
-import org.broadinstitute.sting.utils.sam.AlignmentUtils;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.variant.variantcontext.*;
-import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
-
-import java.io.*;
-import java.util.*;
-
-
-/**
- * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
- * but this latter functionality is now superceded by UnifiedGenotyper.
- *
- *
- * This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
- * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
- * include additional statistics such as mismatches and base qualitites around the calls, read strandness (how many
- * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
- * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
- * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
- * metrics for the post-processing tools to make the final decision). The calls are performed by default
- * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
- * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
- * respectively. Indels are called from the tumor sample and annotated as germline
- * if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
- * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
- * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
- * in tumor sample, it will not be reported.
- *
- * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
- * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
- * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
- *
- * Which (putative) calls will make it into the output file(s) is controlled by an expression/list of expressions passed with -filter
- * flag: if any of the expressions evaluate to TRUE, the site will be discarded. Otherwise the putative call and all the
- * associated statistics will be printed into the output. Expressions recognize the following variables(in paired-sample
- * somatic mode variables are prefixed with T_ and N_ for Tumor and Normal, e.g. N_COV and T_COV are defined instead of COV):
- * COV for coverage at the site, INDEL_F for fraction of reads supporting consensus indel at the site (wrt total coverage),
- * INDEL_CF for fraction of reads with consensus indel wrt all reads with an indel at the site, CONS_CNT for the count of
- * reads supporting the consensus indel at the site. Conventional arithmetic and logical operations are supported. For instance,
- * N_COV<4||T_COV<6||T_INDEL_F<0.3||T_INDEL_CF<0.7 instructs the tool to only output indel calls with at least 30% observed
- * allelic fraction and with consensus indel making at least 70% of all indel observations at the site, and only at the sites
- * where tumor coverage and normal coverage are at least 6 and 4, respectively.
- *
 - * Input
- *
- * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
- *
- *
- * Output
- *
- * Indel calls with associated metrics.
- *
- *
- * Examples
- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- * -R ref.fasta \
- * -T SomaticIndelDetector \
- * -o indels.vcf \
- * -verbose indels.txt
- * -I:normal normal.bam \
- * -I:tumor tumor.bam
- *
- *
- */
-
-@DocumentedGATKFeature( groupName = "Cancer-specific Variant Discovery Tools", extraDocs = {CommandLineGATK.class} )
-@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
-public class SomaticIndelDetector extends ReadWalker {
-// @Output
-// PrintStream out;
- @Output(doc="File to write variants (indels) in VCF format",required=true)
- protected VariantContextWriter vcf_writer = null;
-
- @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
- @Deprecated
- java.io.File output_file;
-
- @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print callability metrics output", required = false)
- public PrintStream metricsWriter = null;
-
-// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false)
-// boolean FORMAT_VCF = false;
-
- @Hidden
- @Input(fullName = "genotype_intervals", shortName = "genotype",
- doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
- public IntervalBinding genotypeIntervalsFile = null;
-
- @Hidden
- @Argument(fullName="unpaired", shortName="unpaired",
- doc="Perform unpaired calls (no somatic status detection)", required=false)
- boolean call_unpaired = false;
- boolean call_somatic ;
-
- @Argument(fullName="verboseOutput", shortName="verbose",
- doc="Verbose output file in text format", required=false)
- java.io.File verboseOutput = null;
-
- @Argument(fullName="bedOutput", shortName="bed",
- doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
- java.io.File bedOutput = null;
-
- @Deprecated
- @Argument(fullName="minCoverage", shortName="minCoverage",
- doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
- "with --unpaired (single sample) option, this value is used for minimum sample coverage. "+
- "INSTEAD USE: T_COV FILTER_EXPRESSIONS = new ArrayList();
-
-//@Argument(fullName="blacklistedLanes", shortName="BL",
-// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
-// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
-//PlatformUnitFilterHelper dummy;
-
- @Hidden
- @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
- required=false) Boolean DEBUG = false;
- @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
- "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
- "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
- "than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
- "if window is too small",required=false) int WINDOW_SIZE = 200;
- @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
- " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
-
-
-
- private WindowContext tumor_context;
- private WindowContext normal_context;
- private int currentContigIndex = -1;
- private int contigLength = -1; // we see to much messy data with reads hanging out of contig ends...
- private int currentPosition = -1; // position of the last read we've seen on the current contig
- private String refName = null;
- private java.io.Writer output = null;
- private GenomeLoc location = null;
- private long normalCallsMade = 0L, tumorCallsMade = 0L;
-
- boolean outOfContigUserWarned = false;
-
- private LocationAwareSeekableRODIterator refseqIterator=null;
-
-// private Set normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able
-// private Set tumorReadGroups ; // to properly assign the reads coming from a merged stream
- private Set normalSamples; // we are going to remember which samples are normal and which are tumor:
- private Set tumorSamples ; // these are used only to generate genotypes for vcf output
-
- private int NQS_WIDTH = 5; // 5 bases on each side of the indel for NQS-style statistics
-
- private Writer bedWriter = null;
- private Writer verboseWriter = null;
-
-
- private static String annGenomic = "GENOMIC\t";
- private static String annIntron = "INTRON";
- private static String annUTR = "UTR";
- private static String annCoding = "CODING";
- private static String annUnknown = "UNKNOWN";
-
- enum CallType {
- NOCOVERAGE,
- BADCOVERAGE,
- NOEVIDENCE,
- GERMLINE,
- SOMATIC
- };
-
- private SAMRecord lastRead;
- private byte[] refBases;
- private ReferenceDataSource refData;
- private Iterator genotypeIntervalIterator = null;
-
- // the current interval in the list of intervals, for which we want to do full genotyping
- private GenomeLoc currentGenotypeInterval = null;
- private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed;
- // can be 1 base before lastGenotyped start
-
- private JexlEngine jexlEngine = new JexlEngine();
- private ArrayList jexlExpressions = new ArrayList();
-
- // the following arrays store indel source-specific (normal/tumor) metric names
- // for fast access when populating JEXL expression contexts (see IndelPrecall.fillContext())
- private final static String[] normalMetricsCassette = new String[4];
- private final static String[] tumorMetricsCassette = new String[4];
- private final static String[] singleMetricsCassette = new String[4];
- private final static int C_COV=0;
- private final static int C_CONS_CNT=1;
- private final static int C_INDEL_F=2;
- private final static int C_INDEL_CF=3;
- static {
- normalMetricsCassette[C_COV] = "N_COV";
- tumorMetricsCassette[C_COV] = "T_COV";
- singleMetricsCassette[C_COV] = "COV";
- normalMetricsCassette[C_CONS_CNT] = "N_CONS_CNT";
- tumorMetricsCassette[C_CONS_CNT] = "T_CONS_CNT";
- singleMetricsCassette[C_CONS_CNT] = "CONS_CNT";
- normalMetricsCassette[C_INDEL_F] = "N_INDEL_F";
- tumorMetricsCassette[C_INDEL_F] = "T_INDEL_F";
- singleMetricsCassette[C_INDEL_F] = "INDEL_F";
- normalMetricsCassette[C_INDEL_CF] = "N_INDEL_CF";
- tumorMetricsCassette[C_INDEL_CF] = "T_INDEL_CF";
- singleMetricsCassette[C_INDEL_CF] = "INDEL_CF";
- }
-
- // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt"
-
- private Set getVCFHeaderInfo() {
- Set headerInfo = new HashSet();
-
- // first, the basic info
- headerInfo.add(new VCFHeaderLine("source", "SomaticIndelDetector"));
- headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
- headerInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
-
- // FORMAT and INFO fields
-// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
-
- headerInfo.addAll(VCFIndelAttributes.getAttributeHeaderLines());
- if ( call_somatic ) {
- headerInfo.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event"));
- } else {
- }
-
- // all of the arguments from the argument collection
- Set args = new HashSet();
- args.add(this);
- args.addAll(getToolkit().getFilters());
- Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args);
- for ( Map.Entry commandLineArg : commandLineArgs.entrySet() )
- headerInfo.add(new VCFHeaderLine(String.format("SID_%s", commandLineArg.getKey()), commandLineArg.getValue()));
- // also, the list of input bams
- for ( String fileName : getToolkit().getArguments().samFiles )
- headerInfo.add(new VCFHeaderLine("SID_bam_file_used", fileName));
-
- return headerInfo;
- }
-
-
- @Override
- public void initialize() {
-
- call_somatic = (call_unpaired ? false : true);
- normal_context = new WindowContext(0,WINDOW_SIZE);
- normalSamples = new HashSet();
-
- if ( bedOutput != null && output_file != null ) {
- throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time");
- }
-
- if ( RefseqFileName != null ) {
- logger.info("Using RefSeq annotations from "+RefseqFileName);
-
- RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
- getToolkit().getGenomeLocParser(),
- getToolkit().getArguments().unsafe);
- RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName));
-
- refseqIterator = new SeekableRODIterator(refseq.getHeader(),
- refseq.getSequenceDictionary(),
- getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
- getToolkit().getGenomeLocParser(),
- refseq.getIterator());
- }
-
- if ( refseqIterator == null ) logger.info("No gene annotations available");
-
- int nSams = getToolkit().getArguments().samFiles.size();
-
- if ( call_somatic ) {
- if ( nSams < 2 ) throw new UserException.BadInput("In default (paired sample) mode at least two bam files (normal and tumor) must be specified");
- tumor_context = new WindowContext(0,WINDOW_SIZE);
- tumorSamples = new HashSet();
- }
-
- int nNorm = 0;
- int nTum = 0;
- for ( SAMReaderID rid : getToolkit().getReadsDataSource().getReaderIDs() ) {
- Tags tags = rid.getTags() ;
- if ( tags.getPositionalTags().isEmpty() && call_somatic )
- throw new UserException.BadInput("In default (paired sample) mode all input bam files must be tagged as either 'normal' or 'tumor'. Untagged file: "+
- getToolkit().getSourceFileForReaderID(rid));
- boolean normal = false;
- boolean tumor = false;
- for ( String s : tags.getPositionalTags() ) { // we allow additional unrelated tags (and we do not use them), but we REQUIRE one of Tumor/Normal to be present if --somatic is on
- if ( "NORMAL".equals(s.toUpperCase()) ) {
- normal = true;
- nNorm++;
- }
- if ( "TUMOR".equals(s.toUpperCase()) ) {
- tumor = true;
- nTum++ ;
- }
- }
- if ( call_somatic && normal && tumor ) throw new UserException.BadInput("Input bam file "+
- getToolkit().getSourceFileForReaderID(rid)+" is tagged both as normal and as tumor. Which one is it??");
- if ( call_somatic && !normal && ! tumor )
- throw new UserException.BadInput("In somatic mode all input bams must be tagged as either normal or tumor. Encountered untagged file: "+
- getToolkit().getSourceFileForReaderID(rid));
- if ( ! call_somatic && (normal || tumor) )
- System.out.println("WARNING: input bam file "+getToolkit().getSourceFileForReaderID(rid)
- +" is tagged as Normal and/or Tumor, but somatic mode is not on. Tags will ne IGNORED");
- if ( call_somatic && tumor ) {
- for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) {
- tumorSamples.add(rg.getSample());
- }
- } else {
- for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) {
- normalSamples.add(rg.getSample());
- }
- }
- if ( genotypeIntervalsFile != null ) {
-
- // read in the whole list of intervals for cleaning
- GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), genotypeIntervalsFile.getIntervals(getToolkit()), IntervalMergingRule.OVERLAPPING_ONLY);
- genotypeIntervalIterator = locs.iterator();
-
- // wrap intervals requested for genotyping inside overlapping iterator, so that we actually
- // genotype only on the intersections of the requested intervals with the -L intervals
- genotypeIntervalIterator = new OverlappingIntervalIterator(genotypeIntervalIterator, getToolkit().getIntervals().iterator() );
-
- currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null;
-
- if ( DEBUG) System.out.println("DEBUG>> first genotyping interval="+currentGenotypeInterval);
-
- if ( currentGenotypeInterval != null ) lastGenotypedPosition = currentGenotypeInterval.getStart()-1;
- }
-
- }
-
- location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1);
-
- try {
- // we already checked that bedOutput and output_file are not set simultaneously
- if ( bedOutput != null ) bedWriter = new FileWriter(bedOutput);
- if ( output_file != null ) bedWriter = new FileWriter(output_file);
- } catch (java.io.IOException e) {
- throw new UserException.CouldNotReadInputFile(bedOutput, "Failed to open BED file for writing.", e);
- }
- try {
- if ( verboseOutput != null ) verboseWriter = new FileWriter(verboseOutput);
- } catch (java.io.IOException e) {
- throw new UserException.CouldNotReadInputFile(verboseOutput, "Failed to open BED file for writing.", e);
- }
-
- vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ;
- refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile);
-
- // Now initialize JEXL expressions:
- if ( FILTER_EXPRESSIONS.size() == 0 ) {
- if ( call_unpaired ) {
- FILTER_EXPRESSIONS.add("COV<6||INDEL_F<0.3||INDEL_CF<0.7");
- } else {
- FILTER_EXPRESSIONS.add("T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7");
- }
- }
- for ( String s : FILTER_EXPRESSIONS ) {
- try {
- Expression e = jexlEngine.createExpression(s);
- jexlExpressions.add(e);
- } catch (Exception e) {
- throw new UserException.BadArgumentValue("Filter expression", "Invalid expression used (" + s + "). Please see the JEXL docs for correct syntax.") ;
- }
-
- }
- }
-
-
- @Override
- public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
-
- // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ");
-
- if ( DEBUG ) {
- // System.out.println("DEBUG>> read at "+ read.getAlignmentStart()+"-"+read.getAlignmentEnd()+
- // "("+read.getCigarString()+")");
- if ( read.getDuplicateReadFlag() ) System.out.println("DEBUG>> Duplicated read (IGNORED)");
- }
-
- if ( AlignmentUtils.isReadUnmapped(read) ||
- read.getDuplicateReadFlag() ||
- read.getNotPrimaryAlignmentFlag() ||
- read.getMappingQuality() == 0 ) {
- return 0; // we do not need those reads!
- }
-
- if ( read.getReferenceIndex() != currentContigIndex ) {
- // we just jumped onto a new contig
- if ( DEBUG ) System.out.println("DEBUG>>> Moved to contig "+read.getReferenceName());
- if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal
- throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName()+": contig is out of order; input BAM file is unsorted");
-
- // print remaining indels from the previous contig (if any);
- if ( call_somatic ) emit_somatic(1000000000, true);
- else emit(1000000000,true);
-
- currentContigIndex = read.getReferenceIndex();
- currentPosition = read.getAlignmentStart();
- refName = new String(read.getReferenceName());
-
- location = getToolkit().getGenomeLocParser().createGenomeLoc(refName,location.getStart(),location.getStop());
- contigLength = getToolkit().getGenomeLocParser().getContigInfo(refName).getSequenceLength();
- outOfContigUserWarned = false;
-
- lastGenotypedPosition = -1;
-
- normal_context.clear(); // reset coverage window; this will also set reference position to 0
- if ( call_somatic) tumor_context.clear();
-
- refBases = new String(refData.getReference().getSequence(read.getReferenceName()).getBases()).toUpperCase().getBytes();
- }
-
- // we have reset the window to the new contig if it was required and emitted everything we collected
- // on a previous contig. At this point we are guaranteed that we are set up properly for working
- // with the contig of the current read.
-
- // NOTE: all the sanity checks and error messages below use normal_context only. We make sure that normal_context and
- // tumor_context are synchronized exactly (windows are always shifted together by emit_somatic), so it's safe
-
- if ( read.getAlignmentStart() < currentPosition ) // oops, read out of order?
- throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName() +" out of order on the contig\n"+
- "Read starts at "+refName+":"+read.getAlignmentStart()+"; last read seen started at "+refName+":"+currentPosition
- +"\nLast read was: "+lastRead.getReadName()+" RG="+lastRead.getAttribute("RG")+" at "+lastRead.getAlignmentStart()+"-"
- +lastRead.getAlignmentEnd()+" cigar="+lastRead.getCigarString());
-
- currentPosition = read.getAlignmentStart();
- lastRead = read;
-
- if ( read.getAlignmentEnd() > contigLength ) {
- if ( ! outOfContigUserWarned ) {
- System.out.println("WARNING: Reads aligned past contig length on "+ location.getContig()+"; all such reads will be skipped");
- outOfContigUserWarned = true;
- }
- return 0;
- }
-
- long alignmentEnd = read.getAlignmentEnd();
- Cigar c = read.getCigar();
- int lastNonClippedElement = 0; // reverse offset to the last unclipped element
- CigarOperator op = null;
- // moving backwards from the end of the cigar, skip trailing S or H cigar elements:
- do {
- lastNonClippedElement++;
- op = c.getCigarElement( c.numCigarElements()-lastNonClippedElement ).getOperator();
- } while ( op == CigarOperator.H || op == CigarOperator.S );
-
- // now op is the last non-S/H operator in the cigar.
-
- // a little trick here: we want to make sure that current read completely fits into the current
- // window so that we can accumulate indel observations over the whole length of the read.
- // The ::getAlignmentEnd() method returns the last position on the reference where bases from the
- // read actually match (M cigar elements). After our cleaning procedure, we can have reads that end
- // with I element, which is not gonna be counted into alignment length on the reference. On the other hand,
- // in this program we assign insertions, internally, to the first base *after* the insertion position.
- // Hence, we have to make sure that that extra base is already in the window or we will get IndexOutOfBounds.
-
- if ( op == CigarOperator.I) alignmentEnd++;
-
- if ( alignmentEnd > normal_context.getStop()) {
-
- // we don't emit anything until we reach a read that does not fit into the current window.
- // At that point we try shifting the window to the start of that read (or reasonably close) and emit everything prior to
- // that position. This is legitimate, since the reads are sorted and we are not gonna see any more coverage at positions
- // below the current read's start.
- // Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting
- // the window to around the read's start will ensure that the read fits...
-
- if ( DEBUG) System.out.println("DEBUG>> Window at "+normal_context.getStart()+"-"+normal_context.getStop()+", read at "+
- read.getAlignmentStart()+": trying to emit and shift" );
- if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false );
- else emit( read.getAlignmentStart(), false );
-
- // let's double check now that the read fits after the shift
- if ( read.getAlignmentEnd() > normal_context.getStop()) {
- // ooops, looks like the read does not fit into the window even after the latter was shifted!!
- // we used to die over such reads and require user to run with larger window size. Now we
- // just print a warning and discard the read (this means that our counts can be slightly off in
- // th epresence of such reads)
- //throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+
- // "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
- // read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+
- // "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop());
- System.out.println("WARNING: Read "+read.getReadName()+
- " is out of coverage window bounds. Probably window is too small and the window_size value must be increased.\n"+
- " The read is ignored in this run (so all the counts/statistics reported will not include it).\n"+
- " Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
- read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+
- "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop());
- return 1;
- }
- }
-
- if ( call_somatic ) {
-
- Tags tags = getToolkit().getReaderIDForRead(read).getTags();
- boolean assigned = false;
- for ( String s : tags.getPositionalTags() ) {
- if ( "NORMAL".equals(s.toUpperCase()) ) {
- normal_context.add(read,ref.getBases());
- assigned = true;
- break;
- }
- if ( "TUMOR".equals(s.toUpperCase()) ) {
- tumor_context.add(read,ref.getBases());
- assigned = true;
- break;
- }
- }
- if ( ! assigned )
- throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+
- "has no Normal/Tumor tag associated with it");
-
-// String rg = (String)read.getExtendedAttribute("RG");
-// if ( rg == null )
-// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls.");
-
-// if ( normalReadGroups.contains(rg) ) {
-// normal_context.add(read,ref.getBases());
-// } else if ( tumorReadGroups.contains(rg) ) {
-// tumor_context.add(read,ref.getBases());
-// } else {
-// throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg);
-// }
-
- if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) {
- System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+
- refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. The whole window will be dropped.");
- tumor_context.shift(WINDOW_SIZE);
- normal_context.shift(WINDOW_SIZE);
- }
- if ( normal_context.getReads().size() > MAX_READ_NUMBER ) {
- System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+
- refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+" in normal sample. The whole window will be dropped");
- tumor_context.shift(WINDOW_SIZE);
- normal_context.shift(WINDOW_SIZE);
- }
-
-
- } else {
- normal_context.add(read, ref.getBases());
- if ( normal_context.getReads().size() > MAX_READ_NUMBER ) {
- System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+
- refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+". The whole window will be dropped");
- normal_context.shift(WINDOW_SIZE);
- }
- }
-
- return 1;
- }
-
- /** An auxiliary shortcut: returns true if position(location.getContig(), p) is past l */
- private boolean pastInterval(long p, GenomeLoc l) {
- return ( location.getContigIndex() > l.getContigIndex() ||
- location.getContigIndex() == l.getContigIndex() && p > l.getStop() );
- }
-
- /** Emit calls of the specified type across genotyping intervals, from position lastGenotypedPosition+1 to
- * pos-1, inclusive.
- * @param contigIndex
- * @param pos
- * @param call
- */
- /*
- private void emitNoCallsUpTo(int contigIndex, long pos, CallType call) {
-
- if ( contigIndex < currentGenotypeInterval.getContigIndex() ||
- contigIndex == currentGenotypeInterval.getContigIndex() && pos <= currentGenotypeInterval.getStart() ) return;
-
- if ( contigIndex == currentGenotypeInterval.getContigIndex() && pos >= currentGenotypeInterval.getStart() ) {
- for ( long p = lastGenotypedPosition+1; p < pos; p++ ) {
-
- }
- }
- while( currentGenotypeInterval != null ) {
-
- while ( )
- if ( genotypeIntervalIterator.hasNext() ) {
- currentGenotypeInterval = genotypeIntervalIterator.next() ;
- if ( pastInterval(p,currentGenotypeInterval) ) {
- // if we are about to jump over the whole next interval, we need to emit NO_COVERAGE calls there!
- emitNoCoverageCalls(currentGenotypeInterval);
- }
- } else {
- currentGenotypeInterval = null;
- }
- }
- }
-*/
-
- /** Output indel calls up to the specified position and shift the window: after this method is executed, the
- * first element of the window maps onto 'position', if possible, or at worst a few bases to the left of 'position' if we may need more
- * reads to get full NQS-style statistics for an indel in the close proximity of 'position'.
- *
- * @param position
- */
- private void emit(long position, boolean force) {
-
- long adjustedPosition = adjustPosition(position);
-
- if ( adjustedPosition == -1 ) {
- // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether
- normal_context.shift((int)(position-normal_context.getStart()));
- return;
- }
- long move_to = adjustedPosition;
-
- for ( int pos = normal_context.getStart() ; pos < Math.min(adjustedPosition,normal_context.getStop()+1) ; pos++ ) {
-
- boolean genotype = false;
- // first let's see if we need to genotype current position:
-
- final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf)
-
- if ( pos <= lastGenotypedPosition ) continue;
-
- while ( currentGenotypeInterval != null ) {
-
- // if we did not even reach next interval yet, no genotyping at current position:
- if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() ||
- location.getContigIndex() == currentGenotypeInterval.getContigIndex() &&
- p < currentGenotypeInterval.getStart() ) break;
- if ( pastInterval(p, currentGenotypeInterval) ) {
- // we are past current genotyping interval, so we are done with it; let's load next interval:
- currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null;
- continue; // re-enter the loop to check against the interval we just loaded
- }
-
- // we reach this point only if p is inside current genotyping interval; set the flag and bail out:
- genotype = true;
- break;
- }
-
-// if ( DEBUG ) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype);
-
- if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue;
-
- IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH);
- JexlContext jc = new MapContext();
- normalCall.fillContext(jc,singleMetricsCassette);
- boolean discard_event = false;
-
- for ( Expression e : jexlExpressions ) {
- if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; }
- }
-
- if ( discard_event && ! genotype ) {
- normal_context.indelsAt(pos).clear();
- continue; //*
- }
-
-// if ( normalCall.getCoverage() < minCoverage && ! genotype ) {
-// if ( DEBUG ) {
-// System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
-// }
-// continue; // low coverage
-// }
-
- if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos);
-
- long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() );
- long right = pos+( normalCall.getVariant() == null ? 0 : normalCall.getVariant().lengthOnRef())+NQS_WIDTH-1;
-
- if ( right >= adjustedPosition && ! force) {
- // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect
-
- // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it;
- // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage
- move_to = adjustPosition(left);
- if ( move_to == -1 ) {
- // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether
- normal_context.shift((int)(adjustedPosition-normal_context.getStart()));
- return;
- }
- if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to);
- break;
- }
-
- // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right:
- if ( right > normal_context.getStop() ) right = normal_context.getStop();
-
- // location = getToolkit().getGenomeLocParser().setStart(location,pos);
- // location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data
-
- location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos);
-
- if ( ! discard_event ) normalCallsMade++;
- printVCFLine(vcf_writer,normalCall, discard_event);
- if ( bedWriter != null ) normalCall.printBedLine(bedWriter);
- if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, discard_event);
- lastGenotypedPosition = pos;
-
- normal_context.indelsAt(pos).clear();
- // we dealt with this indel; don't want to see it again
- // (we might otherwise in the case when 1) there is another indel that follows
- // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel)
- }
-
- if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")");
- normal_context.shift((int)(move_to - normal_context.getStart() ) );
- }
-
- /** A shortcut. Returns true if we got indels within the specified interval in single and only window context
- * (for single-sample calls) or in either of the two window contexts (for two-sample/somatic calls)
- *
- */
- private boolean indelsPresentInInterval(long start, long stop) {
- if ( tumor_context == null ) return normal_context.hasIndelsInInterval(start,stop);
- return tumor_context.hasIndelsInInterval(start,stop) ||
- normal_context.hasIndelsInInterval(start,stop);
- }
- /** Takes the position, to which window shift is requested, and tries to adjust it in such a way that no NQS window is broken.
- * Namely, this method checks, iteratively, if there is an indel within NQS_WIDTH bases ahead of initially requested or adjusted
- * shift position. If there is such an indel,
- * then shifting to that position would lose some or all NQS-window bases to the left of the indel (since it's not going to be emitted
- * just yet). Instead, this method tries to readjust the shift position leftwards so that full NQS window to the left of the next indel
- * is preserved. This method tries thie strategy 4 times (so that it would never walk away too far to the left), and if it fails to find
- * an appropriate adjusted shift position (which could happen if there are many indels following each other at short intervals), it will give up,
- * go back to the original requested shift position and try finding the first shift poisition that has no indel associated with it.
- */
-
- private long adjustPosition(long request) {
- long initial_request = request;
- int attempts = 0;
- boolean failure = false;
- while ( indelsPresentInInterval(request,request+NQS_WIDTH) ) {
- request -= NQS_WIDTH;
- if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request);
- attempts++;
- if ( attempts == 4 ) {
- if ( DEBUG ) System.out.println("DEBUG>> attempts to preserve full NQS window failed; now trying to find any suitable position.") ;
- failure = true;
- break;
- }
- }
-
- if ( failure ) {
- // we tried 4 times but did not find a good shift position that would preserve full nqs window
- // around all indels. let's fall back and find any shift position as long and there's no indel at the very
- // first position after the shift (this is bad for other reasons); if it breaks a nqs window, so be it
- request = initial_request;
- attempts = 0;
- while ( indelsPresentInInterval(request,request+1) ) {
- request--;
- if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request);
- attempts++;
- if ( attempts == 50 ) {
- System.out.println("WARNING: Indel at every position in the interval "+refName+":"+request+"-"+initial_request+
- ". Can not find a break to shift context window to; no calls will be attempted in the current window.");
- return -1;
- }
- }
- }
- if ( DEBUG ) System.out.println("DEBUG>> Found acceptable target position "+request);
- return request;
- }
-
- /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
- * first elements of the coverage arrays map onto 'position', or a few bases prior to the specified position
- * if there is an indel in close proximity to 'position' so that we may get more coverage around it later.
- *
- * @param position
- */
- private void emit_somatic(long position, boolean force) {
-
- long adjustedPosition = adjustPosition(position);
- if ( adjustedPosition == -1 ) {
- // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether
- normal_context.shift((int)(position-normal_context.getStart()));
- tumor_context.shift((int)(position-tumor_context.getStart()));
- return;
- }
- long move_to = adjustedPosition;
-
- if ( DEBUG ) System.out.println("DEBUG>> Emitting in somatic mode up to "+position+" force shift="+force+" current window="+tumor_context.getStart()+"-"+tumor_context.getStop());
-
- for ( int pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) {
-
- boolean genotype = false;
- // first let's see if we need to genotype current position:
-
- final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf)
-
- if ( pos <= lastGenotypedPosition ) continue;
-
- while ( currentGenotypeInterval != null ) {
-
- // if we did not even reach next interval yet, no genotyping at current position:
- if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() ||
- location.getContigIndex() == currentGenotypeInterval.getContigIndex() &&
- p < currentGenotypeInterval.getStart() ) break;
- if ( pastInterval(p, currentGenotypeInterval) ) {
- // we are past current genotyping interval, so we are done with it; let's load next interval:
- currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null;
- continue; // re-enter the loop to check against the interval we just loaded
- }
-
- // we reach tjis point only if p is inside current genotyping interval; set the flag and bail out:
- genotype = true;
- break;
- }
-// if ( DEBUG) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype);
-
- if ( tumor_context.indelsAt(pos).size() == 0 && ! genotype ) continue; // no indels in tumor
-
- if ( DEBUG && genotype ) System.out.println("DEBUG>> Genotyping requested at "+pos);
-
- IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH);
- IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH);
-
- JexlContext jc = new MapContext();
- tumorCall.fillContext(jc,tumorMetricsCassette);
- normalCall.fillContext(jc,normalMetricsCassette);
- boolean discard_event = false;
-
- for ( Expression e : jexlExpressions ) {
- if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; }
- }
-
- if ( discard_event && ! genotype ) {
- tumor_context.indelsAt(pos).clear();
- normal_context.indelsAt(pos).clear();
- continue; //*
- }
-// if ( tumorCall.getCoverage() < minCoverage && ! genotype ) {
-// if ( DEBUG ) {
-// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)");
-// }
-// continue; // low coverage
-// }
-// if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) {
-// if ( DEBUG ) {
-// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
-// }
-// continue; // low coverage
-// }
-
- if ( DEBUG ) {
- System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, ");
- System.out.print("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in normal at "+pos);
- }
-
- long left = Math.max( pos-NQS_WIDTH, tumor_context.getStart() );
- long right = pos+ ( tumorCall.getVariant() == null ? 0 : tumorCall.getVariant().lengthOnRef() )+NQS_WIDTH-1;
-
- if ( right >= adjustedPosition && ! force) {
- // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect
-
- // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it;
- // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage
- move_to = adjustPosition(left);
- if ( move_to == -1 ) {
- // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether
- normal_context.shift((int)(adjustedPosition-normal_context.getStart()));
- tumor_context.shift((int)(adjustedPosition-tumor_context.getStart()));
- return;
- }
- if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to);
- break;
- }
-
- if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right
-
- location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data
-
-// boolean haveCall = tumorCall.isCall(); // cache the value
-
- if ( ! discard_event ) tumorCallsMade++;
-
- printVCFLine(vcf_writer,normalCall,tumorCall,discard_event);
-
- if ( bedWriter != null ) tumorCall.printBedLine(bedWriter);
-
- if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall, discard_event );
- lastGenotypedPosition = pos;
-
- tumor_context.indelsAt(pos).clear();
- normal_context.indelsAt(pos).clear();
- // we dealt with this indel; don't want to see it again
- // (we might otherwise in the case when 1) there is another indel that follows
- // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel)
- }
-
- if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")");
- tumor_context.shift((int)(move_to - tumor_context.getStart() ) );
- normal_context.shift((int)(move_to - normal_context.getStart() ) );
- }
-
- private String makeFullRecord(IndelPrecall normalCall, IndelPrecall tumorCall) {
- StringBuilder fullRecord = new StringBuilder();
- if ( tumorCall.getVariant() != null || normalCall.getVariant() == null) {
- fullRecord.append(tumorCall.makeEventString());
- } else {
- fullRecord.append(normalCall.makeEventString());
- }
- fullRecord.append('\t');
- fullRecord.append(normalCall.makeStatsString("N_"));
- fullRecord.append('\t');
- fullRecord.append(tumorCall.makeStatsString("T_"));
- fullRecord.append('\t');
- return fullRecord.toString();
- }
-
- private String makeFullRecord(IndelPrecall normalCall) {
- StringBuilder fullRecord = new StringBuilder();
- fullRecord.append(normalCall.makeEventString());
- fullRecord.append('\t');
- fullRecord.append(normalCall.makeStatsString(""));
- fullRecord.append('\t');
- return fullRecord.toString();
- }
-
- private String getAnnotationString(RODRecordList ann) {
- if ( ann == null ) return annGenomic;
- else {
- StringBuilder b = new StringBuilder();
-
- if ( RefSeqFeature.isExon(ann) ) {
- if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
- else b.append(annUTR); // exon but not coding = UTR
- } else {
- if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
- else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
- }
- b.append('\t');
- b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed
-// while ( it.hasNext() ) { //
-// t.getGeneName()
-// }
- return b.toString();
- }
-
- }
-
- public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, boolean discard_event) {
- RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location));
- String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));
-
- StringBuilder fullRecord = new StringBuilder();
- fullRecord.append(makeFullRecord(normalCall));
- fullRecord.append(annotationString);
- if ( discard_event && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
- try {
- verboseWriter.write(fullRecord.toString());
- verboseWriter.write('\n');
- } catch (IOException e) {
- throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e);
- }
-
- }
-
-
- public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall, boolean discard_event) {
- RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location));
- String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));
-
- StringBuilder fullRecord = new StringBuilder();
- fullRecord.append(makeFullRecord(normalCall,tumorCall));
-
- if ( normalCall.getVariant() == null && tumorCall.getVariant() == null ) {
- // did not observe anything
- if ( normalCall.getCoverage() >= minNormalCoverage && tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE");
- else {
- if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); // no coverage in normal but nothing in tumor
- else {
- // no coverage in tumor; if we have no coverage in normal, it can be anything; if we do have coverage in normal,
- // this still could be a somatic event. so either way it is 'unknown'
- fullRecord.append("UNKNOWN");
- }
- }
-
- }
-
- if ( normalCall.getVariant() == null && tumorCall.getVariant() != null ) {
- // looks like somatic call
- if ( normalCall.getCoverage() >= minNormalCoverage ) fullRecord.append("SOMATIC"); // we confirm there is nothing in normal
- else {
- // low coverage in normal
- fullRecord.append("EVENT_T"); // no coverage in normal, no idea whether it is germline or somatic
- }
- }
-
- if ( normalCall.getVariant() != null && tumorCall.getVariant() == null ) {
- // it's likely germline (with missing observation in tumor - maybe loh?
- if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("GERMLINE_LOH"); // we confirm there is nothing in tumor
- else {
- // low coverage in tumor, maybe we missed the event
- fullRecord.append("GERMLINE"); // no coverage in tumor but we already saw it in normal...
- }
- }
-
- if ( normalCall.getVariant() != null && tumorCall.getVariant() != null ) {
- // events in both T/N, got to be germline!
- fullRecord.append("GERMLINE");
- }
-
-
- fullRecord.append('\t');
- fullRecord.append(annotationString);
-
- if ( discard_event && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
-
- try {
- verboseWriter.write(fullRecord.toString());
- verboseWriter.write('\n');
- } catch (IOException e) {
- throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e);
- }
- }
-
- public void printVCFLine(VariantContextWriter vcf, IndelPrecall call, boolean discard_event) {
-
- long start = call.getPosition()-1;
- // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed.
- // The suggestion is instead of putting the base before the indel, to put the base after the indel.
- // For now, just don't print out that site.
- if ( start == 0 )
- return;
-
- long stop = start;
-
- List alleles = new ArrayList(2); // actual observed (distinct!) alleles at the site
- List homref_alleles = null; // when needed, will contain two identical copies of ref allele - needed to generate hom-ref genotype
-
- final byte referencePaddingBase = refBases[(int)start-1];
-
- if ( call.getVariant() == null ) {
- // we will need to create genotype with two (hom) ref alleles (below).
- // we can not use 'alleles' list here, since that list is supposed to contain
- // only *distinct* alleles observed at the site or VCFContext will frown upon us...
- alleles.add( Allele.create(referencePaddingBase,true) );
- homref_alleles = new ArrayList(2);
- homref_alleles.add( alleles.get(0));
- homref_alleles.add( alleles.get(0));
- } else {
- // we always create alt allele when we observe anything but the ref, even if it is not a call!
- // (Genotype will tell us whether it is an actual call or not!)
- int event_length = call.getVariant().lengthOnRef();
- if ( event_length < 0 ) event_length = 0;
- fillAlleleList(alleles,call,referencePaddingBase);
- stop += event_length;
- }
-
- GenotypesContext genotypes = GenotypesContext.create();
- for ( String sample : normalSamples ) {
- GenotypeBuilder gb = new GenotypeBuilder(sample);
-
- gb=call.addStatsAttributes(gb);
- gb.alleles(! discard_event
- ? alleles // we made a call - put actual het genotype here:
- : homref_alleles); // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
- genotypes.add(gb.make());
-
- }
- Set filters = null;
- if ( call.getVariant() != null && discard_event ) {
- filters = new HashSet();
- filters.add("NoCall");
- }
- VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles)
- .genotypes(genotypes).filters(filters).make();
- vcf.add(vc);
- }
-
- /** Fills l with appropriate alleles depending on whether call is insertion or deletion
- * (l MUST have a variant or this method will crash). It is guaranteed that the *first* allele added
- * to the list is ref, and the next one is alt.
- * @param l
- * @param call
- */
- private void fillAlleleList(List l, IndelPrecall call, byte referencePaddingBase) {
- int event_length = call.getVariant().lengthOnRef();
- if ( event_length == 0 ) { // insertion
-
- l.add( Allele.create(referencePaddingBase,true) );
- l.add( Allele.create((char)referencePaddingBase + new String(call.getVariant().getBases()), false ));
-
- } else { //deletion:
- l.add( Allele.create((char)referencePaddingBase + new String(call.getVariant().getBases()), true ));
- l.add( Allele.create(referencePaddingBase,false) );
- }
- }
-
- public void printVCFLine(VariantContextWriter vcf, IndelPrecall nCall, IndelPrecall tCall, boolean discard_event) {
-
- long start = tCall.getPosition()-1;
- long stop = start;
-
- // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed.
- // The suggestion is instead of putting the base before the indel, to put the base after the indel.
- // For now, just don't print out that site.
- if ( start == 0 )
- return;
-
- GenotypeBuilder nGB = new GenotypeBuilder();
- GenotypeBuilder tGB = new GenotypeBuilder();
-
- nCall.addStatsAttributes(nGB);
- tCall.addStatsAttributes(tGB);
-
- Map attrs = new HashMap();
-
- boolean isSomatic = false;
- if ( nCall.getVariant() == null && tCall.getVariant() != null ) {
- isSomatic = true;
- attrs.put(VCFConstants.SOMATIC_KEY,true);
- }
- List alleles = new ArrayList(2); // all alleles at the site
- // List normal_alleles = null; // all alleles at the site
- List homRefAlleles = null;
-
-// if ( nCall.getVariant() == null || tCall.getVariant() == null ) {
- homRefAlleles = new ArrayList(2) ; // we need this for somatic calls (since normal is ref-ref), and also for no-calls
-// }
- boolean homRefT = ( tCall.getVariant() == null );
- boolean homRefN = ( nCall.getVariant() == null );
- final byte referencePaddingBase = refBases[(int)start-1];
- if ( tCall.getVariant() == null && nCall.getVariant() == null) {
- // no indel at all ; create base-representation ref/ref alleles for genotype construction
- alleles.add( Allele.create(referencePaddingBase,true) );
- } else {
- // we got indel(s)
- int event_length = 0;
- if ( tCall.getVariant() != null ) {
- // indel in tumor
- event_length = tCall.getVariant().lengthOnRef();
- fillAlleleList(alleles, tCall, referencePaddingBase);
- } else {
- event_length = nCall.getVariant().lengthOnRef();
- fillAlleleList(alleles, nCall, referencePaddingBase);
- }
- if ( event_length > 0 ) stop += event_length;
- }
- homRefAlleles.add( alleles.get(0));
- homRefAlleles.add( alleles.get(0));
-
- GenotypesContext genotypes = GenotypesContext.create();
-
- for ( String sample : normalSamples ) {
- genotypes.add(nGB.name(sample).alleles(homRefN ? homRefAlleles : alleles).make());
- }
-
- for ( String sample : tumorSamples ) {
- genotypes.add(tGB.name(sample).alleles(homRefT ? homRefAlleles : alleles).make());
- }
-
- Set filters = null;
- if ( tCall.getVariant() != null && discard_event ) {
- filters = new HashSet();
- filters.add("NoCall");
- }
-
- VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles)
- .genotypes(genotypes).filters(filters).attributes(attrs).make();
- vcf.add(vc);
- }
-
- @Override
- public void onTraversalDone(Integer result) {
- if ( DEBUG ) {
- System.out.println("DEBUG>> Emitting last window at "+normal_context.getStart()+"-"+normal_context.getStop());
- }
- if ( call_somatic ) emit_somatic(1000000000, true);
- else emit(1000000000,true); // emit everything we might have left
-
- if ( metricsWriter != null ) {
- metricsWriter.println(String.format("Normal calls made %d", normalCallsMade));
- metricsWriter.println(String.format("Tumor calls made %d", tumorCallsMade));
- metricsWriter.close();
- }
-
- try {
- if ( bedWriter != null ) bedWriter.close();
- if ( verboseWriter != null ) verboseWriter.close();
- } catch (IOException e) {
- System.out.println("Failed to close output BED file gracefully, data may be lost");
- e.printStackTrace();
- }
- super.onTraversalDone(result);
- }
-
- @Override
- public Integer reduce(Integer value, Integer sum) {
- if ( value == -1 ) {
- onTraversalDone(sum);
- System.exit(1);
- }
- sum += value;
- return sum;
- }
-
- @Override
- public Integer reduceInit() {
- return 0;
- }
-
-
- static class IndelVariant {
- public static enum Type { I, D};
- private String bases;
- private Type type;
- private ArrayList fromStartOffsets = null;
- private ArrayList fromEndOffsets = null;
-
- private Set reads = new HashSet(); // keep track of reads that have this indel
- private Set samples = new HashSet(); // which samples had the indel described by this object
-
- public IndelVariant(ExpandedSAMRecord read , Type type, String bases) {
- this.type = type;
- this.bases = bases.toUpperCase();
- addObservation(read);
- fromStartOffsets = new ArrayList();
- fromEndOffsets = new ArrayList();
- }
-
- /** Adds another observation for the current indel. It is assumed that the read being registered
- * does contain the observation, no checks are performed. Read's sample is added to the list of samples
- * this indel was observed in as well.
- * @param read
- */
- public void addObservation(ExpandedSAMRecord read) {
- if ( reads.contains(read) ) {
- //TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam
- // seeing the same read again can mean only one thing: the input bam file is corrupted and contains
- // duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector
- // so this is a short-term patch: don't cry, but just ignore the duplicate record
-
- //throw new StingException("Attempting to add indel observation that was already registered");
- return;
- }
- reads.add(read);
- String sample = null;
- if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample();
- if ( sample != null ) samples.add(sample);
- }
-
-
- /** Returns length of the event on the reference (number of deleted bases
- * for deletions, -1 for insertions.
- * @return
- */
- public int lengthOnRef() {
- if ( type == Type.D ) return bases.length();
- else return 0;
- }
-
-
- public void addSample(String sample) {
- if ( sample != null )
- samples.add(sample);
- }
-
- public void addReadPositions(int fromStart, int fromEnd) {
- fromStartOffsets.add(fromStart);
- fromEndOffsets.add(fromEnd);
- }
-
- public List getOffsetsFromStart() { return fromStartOffsets ; }
- public List getOffsetsFromEnd() { return fromEndOffsets; }
-
- public String getSamples() {
- StringBuffer sb = new StringBuffer();
- Iterator i = samples.iterator();
- while ( i.hasNext() ) {
- sb.append(i.next());
- if ( i.hasNext() )
- sb.append(",");
- }
- return sb.toString();
- }
-
- public Set getReadSet() { return reads; }
-
- public int getCount() { return reads.size(); }
-
- public String getBases() { return bases; }
-
- public Type getType() { return type; }
-
- @Override
- public boolean equals(Object o) {
- if ( ! ( o instanceof IndelVariant ) ) return false;
- IndelVariant that = (IndelVariant)o;
- return ( this.type == that.type && this.bases.equals(that.bases) );
- }
-
- public boolean equals(Type type, String bases) {
- return ( this.type == type && this.bases.equals(bases.toUpperCase()) );
- }
- }
-
- /**
- * Utility class that encapsulates the logic related to collecting all the stats and counts required to
- * make (or discard) a call, as well as the calling heuristics that uses those data.
- */
- class IndelPrecall {
-// private boolean DEBUG = false;
- private int NQS_MISMATCH_CUTOFF = 1000000;
- private double AV_MISMATCHES_PER_READ = 1.5;
-
- private int nqs = 0;
- private IndelVariant consensus_indel = null; // indel we are going to call
- private long pos = -1 ; // position on the ref
- private int total_coverage = 0; // total number of reads overlapping with the event
- private int consensus_indel_count = 0; // number of reads, in which consensus indel was observed
- private int all_indel_count = 0 ; // number of reads, in which any indel was observed at current position
-
- private int total_mismatches_in_nqs_window = 0; // total number of mismatches in the nqs window around the indel
- private int total_bases_in_nqs_window = 0; // total number of bases in the nqs window (some reads may not fully span the window so it's not coverage*nqs_size)
- private int total_base_qual_in_nqs_window = 0; // sum of qualitites of all the bases in the nqs window
- private int total_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of all mismatching bases in the nqs window
-
- private int indel_read_mismatches_in_nqs_window = 0; // mismatches inside the nqs window in indel-containing reads only
- private int indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from indel-containing reads only
- private int indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from indel-containing reads only
- private int indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from indel-containing reads only
-
-
- private int consensus_indel_read_mismatches_in_nqs_window = 0; // mismatches within the nqs window from consensus indel reads only
- private int consensus_indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from consensus indel-containing reads only
- private int consensus_indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from consensus indel-containing reads only
- private int consensus_indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from consensus indel-containing reads only
-
-
- private double consensus_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain consensus indel
- private double all_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain any indel at given position
- private double all_read_total_mm = 0.0; // sum of all mismatches in all reads
-
- private double consensus_indel_read_total_mapq = 0.0; // sum of mapping qualitites of all reads with consensus indel
- private double all_indel_read_total_mapq = 0.0 ; // sum of mapping qualitites of all reads with (any) indel at current position
- private double all_read_total_mapq = 0.0; // sum of all mapping qualities of all reads
-
- private PrimitivePair.Int consensus_indel_read_orientation_cnt = new PrimitivePair.Int();
- private PrimitivePair.Int all_indel_read_orientation_cnt = new PrimitivePair.Int();
- private PrimitivePair.Int all_read_orientation_cnt = new PrimitivePair.Int();
-
- private int from_start_median = 0;
- private int from_start_mad = 0;
- private int from_end_median = 0;
- private int from_end_mad = 0;
-
- /** Makes an empty call (no-call) with all stats set to 0
- *
- * @param position
- */
- public IndelPrecall(long position) {
- this.pos = position;
- }
-
- public IndelPrecall(WindowContext context, long position, int nqs_width) {
- this.pos = position;
- this.nqs = nqs_width;
- total_coverage = context.coverageAt(pos,true);
- List variants = context.indelsAt(pos);
- findConsensus(variants);
-
- // pos is the first base after the event: first deleted base or first base after insertion.
- // hence, [pos-nqs, pos+nqs-1] (inclusive) is the window with nqs bases on each side of a no-event or an insertion
- // and [pos-nqs, pos+Ndeleted+nqs-1] is the window with nqs bases on each side of a deletion.
- // we initialize the nqs window for no-event/insertion case
- long left = Math.max( pos-nqs, context.getStart() );
- long right = Math.min(pos+nqs-1, context.getStop());
-//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size());
- Iterator read_iter = context.getReads().iterator();
-
-
- while ( read_iter.hasNext() ) {
- ExpandedSAMRecord rec = read_iter.next();
- SAMRecord read = rec.getSAMRecord();
- byte[] flags = rec.getExpandedMMFlags();
- byte[] quals = rec.getExpandedQuals();
- int mm = rec.getMMCount();
-
-
- if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue;
-
- long local_right = right; // end of nqs window for this particular read. May need to be advanced further right
- // if read has a deletion. The gap in the middle of nqs window will be skipped
- // automatically since flags/quals are set to -1 there
-
- boolean read_has_a_variant = false;
- boolean read_has_consensus = ( consensus_indel!= null && consensus_indel.getReadSet().contains(rec) );
- for ( IndelVariant v : variants ) {
- if ( v.getReadSet().contains(rec) ) {
- read_has_a_variant = true;
- local_right += v.lengthOnRef();
- break;
- }
- }
-
- if ( read_has_consensus ) {
- consensus_indel_read_total_mm += mm;
- consensus_indel_read_total_mapq += read.getMappingQuality();
- if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++;
- else consensus_indel_read_orientation_cnt.first++;
- }
- if ( read_has_a_variant ) {
- all_indel_read_total_mm += mm;
- all_indel_read_total_mapq += read.getMappingQuality();
- if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++;
- else all_indel_read_orientation_cnt.first++;
- }
-
- all_read_total_mm+= mm;
- all_read_total_mapq += read.getMappingQuality();
- if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++;
- else all_read_orientation_cnt.first++;
-
- for ( int pos_in_flags = Math.max((int)(left - read.getAlignmentStart()),0);
- pos_in_flags <= Math.min((int)local_right-read.getAlignmentStart(),flags.length - 1);
- pos_in_flags++) {
-
- if ( flags[pos_in_flags] == -1 ) continue; // gap (deletion), skip it; we count only bases aligned to the ref
- total_bases_in_nqs_window++;
- if ( read_has_consensus ) consensus_indel_read_bases_in_nqs_window++;
- if ( read_has_a_variant ) indel_read_bases_in_nqs_window++;
-
- if ( quals[pos_in_flags] != -1 ) {
-
- total_base_qual_in_nqs_window += quals[pos_in_flags];
- if ( read_has_a_variant ) indel_read_base_qual_in_nqs_window += quals[pos_in_flags];
- if ( read_has_consensus ) consensus_indel_read_base_qual_in_nqs_window += quals[pos_in_flags];
- }
-
- if ( flags[pos_in_flags] == 1 ) { // it's a mismatch
- total_mismatches_in_nqs_window++;
- total_mismatching_base_qual_in_nqs_window += quals[pos_in_flags];
-
- if ( read_has_consensus ) {
- consensus_indel_read_mismatches_in_nqs_window++;
- consensus_indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags];
- }
-
- if ( read_has_a_variant ) {
- indel_read_mismatches_in_nqs_window++;
- indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags];
- }
- }
- }
-// if ( pos == 3534096 ) {
-// System.out.println(read.getReadName());
-// System.out.println(" cons nqs bases="+consensus_indel_read_bases_in_nqs_window);
-// System.out.println(" qual sum="+consensus_indel_read_base_qual_in_nqs_window);
-// }
-
- }
-
- // compute median/mad for offsets from the read starts/ends
- if ( consensus_indel != null ) {
- from_start_median = median(consensus_indel.getOffsetsFromStart()) ;
- from_start_mad = mad(consensus_indel.getOffsetsFromStart(),from_start_median);
- from_end_median = median(consensus_indel.getOffsetsFromEnd()) ;
- from_end_mad = mad(consensus_indel.getOffsetsFromEnd(),from_end_median);
- }
- }
-
- /** As a side effect will sort l!
- *
- * @param l
- * @return
- */
- private int median(List l) {
- Collections.sort(l);
- int k = l.size()/2;
- return ( l.size() % 2 == 0 ?
- (l.get(k-1).intValue()+l.get(k).intValue())/2 :
- l.get(k).intValue());
- }
-
- private int median(int[] l) {
- Arrays.sort(l);
- int k = l.length/2;
- return ( l.length % 2 == 0 ?
- (l[k-1]+l[k])/2 :
- l[k]);
- }
-
- private int mad(List l, int med) {
- int [] diff = new int[l.size()];
- for ( int i = 0; i < l.size(); i++ ) {
- diff[i] = Math.abs(l.get(i).intValue() - med);
- }
- return median(diff);
- }
-
- public long getPosition() { return pos; }
-
- public boolean hasObservation() { return consensus_indel != null; }
-
- public int getCoverage() { return total_coverage; }
-
- public double getTotalMismatches() { return all_read_total_mm; }
- public double getConsensusMismatches() { return consensus_indel_read_total_mm; }
- public double getAllVariantMismatches() { return all_indel_read_total_mm; }
-
- /** Returns average number of mismatches per consensus indel-containing read */
- public double getAvConsensusMismatches() {
- return ( consensus_indel_count != 0 ? consensus_indel_read_total_mm/consensus_indel_count : 0.0 );
- }
-
- /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants) */
- public double getAvRefMismatches() {
- int coverage_ref = total_coverage-all_indel_count;
- return ( coverage_ref != 0 ? (all_read_total_mm - all_indel_read_total_mm )/coverage_ref : 0.0 );
- }
-
- public PrimitivePair.Int getConsensusStrandCounts() {
- return consensus_indel_read_orientation_cnt;
- }
-
- public PrimitivePair.Int getRefStrandCounts() {
- return new PrimitivePair.Int(all_read_orientation_cnt.first-all_indel_read_orientation_cnt.first,
- all_read_orientation_cnt.second - all_indel_read_orientation_cnt.second);
- }
-
- /** Returns a sum of mapping qualities of all reads spanning the event. */
- public double getTotalMapq() { return all_read_total_mapq; }
-
- /** Returns a sum of mapping qualities of all reads, in which the consensus variant is observed. */
- public double getConsensusMapq() { return consensus_indel_read_total_mapq; }
-
- /** Returns a sum of mapping qualities of all reads, in which any variant is observed at the current event site. */
- public double getAllVariantMapq() { return all_indel_read_total_mapq; }
-
- /** Returns average mapping quality per consensus indel-containing read. */
- public double getAvConsensusMapq() {
- return ( consensus_indel_count != 0 ? consensus_indel_read_total_mapq/consensus_indel_count : 0.0 );
- }
-
- /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants). */
- public double getAvRefMapq() {
- int coverage_ref = total_coverage-all_indel_count;
- return ( coverage_ref != 0 ? (all_read_total_mapq - all_indel_read_total_mapq )/coverage_ref : 0.0 );
- }
-
- /** Returns fraction of bases in NQS window around the indel that are mismatches, across all reads,
- * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around
- * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]}
- * */
- public double getNQSConsensusMMRate() {
- if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0;
- return ((double)consensus_indel_read_mismatches_in_nqs_window)/consensus_indel_read_bases_in_nqs_window;
- }
-
- /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads
- * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref
- * reads is defined around the event start position, NOT around the actual consensus indel.
- * */
- public double getNQSRefMMRate() {
- int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window;
- if ( num_ref_bases == 0 ) return 0;
- return ((double)(total_mismatches_in_nqs_window - indel_read_mismatches_in_nqs_window))/num_ref_bases;
- }
-
- /** Returns average base quality in NQS window around the indel, across all reads,
- * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around
- * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]}
- * */
- public double getNQSConsensusAvQual() {
- if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0;
- return ((double)consensus_indel_read_base_qual_in_nqs_window)/consensus_indel_read_bases_in_nqs_window;
- }
-
- /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads
- * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref
- * reads is defined around the event start position, NOT around the actual consensus indel.
- * */
- public double getNQSRefAvQual() {
- int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window;
- if ( num_ref_bases == 0 ) return 0;
- return ((double)(total_base_qual_in_nqs_window - indel_read_base_qual_in_nqs_window))/num_ref_bases;
- }
-
- public int getTotalNQSMismatches() { return total_mismatches_in_nqs_window; }
-
- public int getAllVariantCount() { return all_indel_count; }
- public int getConsensusVariantCount() { return consensus_indel_count; }
-
-// public boolean failsNQSMismatch() {
-// //TODO wrong fraction: mismatches are counted only in indel-containing reads, but total_coverage is used!
-// return ( indel_read_mismatches_in_nqs_window > NQS_MISMATCH_CUTOFF ) ||
-// ( indel_read_mismatches_in_nqs_window > total_coverage * AV_MISMATCHES_PER_READ );
-// }
-
- public IndelVariant getVariant() { return consensus_indel; }
-
- public void fillContext(JexlContext context,String[] cassette) {
- context.set(cassette[C_INDEL_F],((double)consensus_indel_count)/total_coverage);
- context.set(cassette[C_INDEL_CF],((double)consensus_indel_count/all_indel_count));
- context.set(cassette[C_COV],total_coverage);
- context.set(cassette[C_CONS_CNT],consensus_indel_count);
- }
-/*
- public boolean isCall() {
- boolean ret = ( consensus_indel_count >= minIndelCount &&
- (double)consensus_indel_count > minFraction * total_coverage &&
- (double) consensus_indel_count > minConsensusFraction*all_indel_count && total_coverage >= minCoverage);
- if ( DEBUG && ! ret ) System.out.println("DEBUG>> NOT a call: count="+consensus_indel_count+
- " total_count="+all_indel_count+" cov="+total_coverage+
- " minConsensusF="+((double)consensus_indel_count)/all_indel_count+
- " minF="+((double)consensus_indel_count)/total_coverage);
-
- return ret;
-// return true;
- }
-*/
- /** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed
- * variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus)
- * @param variants
- * @return
- */
- private void findConsensus(List variants) {
- for ( IndelVariant var : variants ) {
- if ( DEBUG ) System.out.println("DEBUG>> Variant "+var.getBases()+" (cnt="+var.getCount()+")");
- int cnt = var.getCount();
- all_indel_count +=cnt;
- if ( cnt > consensus_indel_count ) {
- consensus_indel = var;
- consensus_indel_count = cnt;
- }
- }
- if ( DEBUG && consensus_indel != null ) System.out.println("DEBUG>> Returning: "+consensus_indel.getBases()+
- " (cnt="+consensus_indel.getCount()+") with total count of "+all_indel_count);
- }
-
-
-
- public void printBedLine(Writer bed) {
- int event_length;
- if ( consensus_indel == null ) event_length = 0;
- else {
- event_length = consensus_indel.lengthOnRef();
- if ( event_length < 0 ) event_length = 0;
- }
-
- StringBuffer message = new StringBuffer();
- message.append(refName+"\t"+(pos-1)+"\t");
- message.append((pos-1+event_length)+"\t");
- if ( consensus_indel != null ) {
- message.append((event_length>0? "-":"+")+consensus_indel.getBases());
- } else {
- message.append('.');
- }
- message.append(":"+all_indel_count+"/"+total_coverage);
- try {
- bed.write(message.toString()+"\n");
- } catch (IOException e) {
- throw new UserException.CouldNotCreateOutputFile(bedOutput, "Error encountered while writing into output BED file", e);
- }
- }
-
- public String makeEventString() {
- int event_length;
- if ( consensus_indel == null ) event_length = 0;
- else {
- event_length = consensus_indel.lengthOnRef();
- if ( event_length < 0 ) event_length = 0;
- }
- StringBuffer message = new StringBuffer();
- message.append(refName);
- message.append('\t');
- message.append(pos-1);
- message.append('\t');
- message.append(pos-1+event_length);
- message.append('\t');
- if ( consensus_indel != null ) {
- message.append((event_length>0?'-':'+'));
- message.append(consensus_indel.getBases());
- } else {
- message.append('.');
- }
- return message.toString();
- }
-
- public String makeStatsString(String prefix) {
- StringBuilder message = new StringBuilder();
- message.append(prefix+"OBS_COUNTS[C/A/T]:"+getConsensusVariantCount()+"/"+getAllVariantCount()+"/"+getCoverage());
- message.append('\t');
- message.append(prefix+"AV_MM[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMismatches(),
- getAvRefMismatches()));
- message.append('\t');
- message.append(prefix+"AV_MAPQ[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMapq(),
- getAvRefMapq()));
- message.append('\t');
- message.append(prefix+"NQS_MM_RATE[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusMMRate(),getNQSRefMMRate()));
- message.append('\t');
- message.append(prefix+"NQS_AV_QUAL[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusAvQual(),getNQSRefAvQual()));
-
- PrimitivePair.Int strand_cons = getConsensusStrandCounts();
- PrimitivePair.Int strand_ref = getRefStrandCounts();
- message.append('\t');
- message.append(prefix+"STRAND_COUNTS[C/C/R/R]:"+strand_cons.first+"/"+strand_cons.second+"/"+strand_ref.first+"/"+strand_ref.second);
-
- message.append('\t');
- message.append(prefix+"OFFSET_RSTART:"+from_start_median+"/"+from_start_mad);
- message.append('\t');
- message.append(prefix+"OFFSET_REND:"+from_end_median+"/"+from_end_mad);
-
- return message.toString();
-
- }
-
- /**
- * Places alignment statistics into attribute map and returns the map. If attr parameter is null,
- * a new map is allocated, filled and returned. If attr is not null, new attributes are added to that
- * preexisting map, and the same instance of the (updated) map is returned.
- *
- * @param attr
- * @return
- */
- public Map makeStatsAttributes(Map attr) {
- if ( attr == null ) attr = new HashMap();
-
- VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),attr);
-
- VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),attr);
-
- VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),attr);
-
- VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),attr);
-
- VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),attr);
-
- VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,attr);
-
- VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,attr);
-
- PrimitivePair.Int strand_cons = getConsensusStrandCounts();
- PrimitivePair.Int strand_ref = getRefStrandCounts();
-
- VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,attr);
- return attr;
- }
-
-
- /**
- * Adds alignment statistics directly into the genotype builder object.
- *
- * @param gb
- * @return
- */
- public GenotypeBuilder addStatsAttributes(GenotypeBuilder gb) {
- if ( gb == null ) gb = new GenotypeBuilder();
-
- gb = VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),gb);
-
- gb = VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),gb);
-
- gb = VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),gb);
-
- gb = VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),gb);
-
- gb = VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),gb);
-
- gb = VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,gb);
-
- gb = VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,gb);
-
- PrimitivePair.Int strand_cons = getConsensusStrandCounts();
- PrimitivePair.Int strand_ref = getRefStrandCounts();
-
- gb = VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,gb);
- return gb;
- }
-
- }
-
- interface IndelListener {
- public void addObservation(int pos, IndelVariant.Type t, String bases, int fromStart, int fromEnd, ExpandedSAMRecord r);
- }
-
- class WindowContext implements IndelListener {
- private Set reads;
- private int start=0; // where the window starts on the ref, 1-based
- private CircularArray< List< IndelVariant > > indels;
-
- private List emptyIndelList = new ArrayList();
-
-
- public WindowContext(int start, int length) {
- this.start = start;
- indels = new CircularArray< List >(length);
-// reads = new LinkedList();
- reads = new HashSet();
- }
-
- /** Returns 1-based reference start position of the interval this object keeps context for.
- *
- * @return
- */
- public int getStart() { return start; }
-
- /** Returns 1-based reference stop position (inclusive) of the interval this object keeps context for.
- *
- * @return
- */
- public int getStop() { return start + indels.length() - 1; }
-
- /** Resets reference start position to 0 and clears the context.
- *
- */
- public void clear() {
- start = 0;
- reads.clear();
- indels.clear();
- }
-
- /**
- * Returns true if any indel observations are present in the specified interval
- * [begin,end] (1-based, inclusive). Interval can be partially of fully outside of the
- * current context window: positions outside of the window will be ignored.
- * @param begin
- * @param end
- */
- public boolean hasIndelsInInterval(long begin, long end) {
- for ( long k = Math.max(start,begin); k < Math.min(getStop(),end); k++ ) {
- if ( indelsAt(k) != emptyIndelList ) return true;
- }
- return false;
- }
-
- public Set getReads() { return reads; }
-
- /** Returns the number of reads spanning over the specified reference position
- * (regardless of whether they have a base or indel at that specific location).
- * The second argument controls whether to count with indels in mind (this is relevant for insertions only,
- * deletions do not require any special treatment since they occupy non-zero length on the ref and since
- * alignment can not start or end with a deletion). For insertions, note that, internally, we assign insertions
- * to the reference position right after the actual event, and we count all events assigned to a given position.
- * This count (reads with indels) should be contrasted to reads without indels, or more rigorously, reads
- * that support the ref rather than the indel. Few special cases may occur here:
- * 1) an alignment that ends (as per getAlignmentEnd()) right before the current position but has I as its
- * last element: we have to count that read into the "coverage" at the current position for the purposes of indel
- * assessment, as the indel in that read will be counted at the current position, so the total coverage
- * should be consistent with that.
- */
- /* NOT IMPLEMENTED: 2) alsignments that start exactly at the current position do not count for the purpose of insertion
- * assessment since they do not contribute any evidence to either Ref or Alt=insertion hypothesis, unless
- * the alignment starts with I (so that we do have evidence for an indel assigned to the current position and
- * read should be counted). For deletions, reads starting at the current position should always be counted (as they
- * show no deletion=ref).
- * @param refPos position on the reference; must be within the bounds of the window
- */
- public int coverageAt(final long refPos, boolean countForIndels) {
- int cov = 0;
- for ( ExpandedSAMRecord read : reads ) {
- if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) {
- if ( countForIndels && read.getSAMRecord().getAlignmentEnd() == refPos - 1) {
- Cigar c = read.getSAMRecord().getCigar();
- if ( c.getCigarElement(c.numCigarElements()-1).getOperator() == CigarOperator.I ) cov++;
- }
- continue;
- }
- cov++;
- }
- return cov;
- }
-
-
- /** Shifts current window to the right along the reference contig by the specified number of bases.
- * The context will be updated accordingly (indels and reads that go out of scope will be dropped).
- * @param offset
- */
- public void shift(int offset) {
- start += offset;
-
- indels.shiftData(offset);
- if ( indels.get(0) != null && indels.get(0).size() != 0 ) {
- IndelVariant indel = indels.get(0).get(0);
-
- System.out.println("WARNING: Indel(s) at first position in the window ("+refName+":"+start+"): currently not supported: "+
- (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; read: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()+"; site ignored");
- indels.get(0).clear();
-// throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+
-// (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName());
- }
-
- Iterator read_iter = reads.iterator();
-
- while ( read_iter.hasNext() ) {
- ExpandedSAMRecord r = read_iter.next();
- if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope
- read_iter.remove();
- }
- }
- }
-
- public void add(SAMRecord read, byte [] ref) {
-
- if ( read.getAlignmentStart() < start ) return; // silently ignore reads starting before the window start
-
- ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this);
- //TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed
- if ( reads.contains(er)) return; // ignore duplicate records
- reads.add(er);
- }
-
- public void addObservation(int pos, IndelVariant.Type type, String bases, int fromStart, int fromEnd, ExpandedSAMRecord rec) {
- List indelsAtSite;
- try {
- indelsAtSite = indels.get(pos);
- } catch (IndexOutOfBoundsException e) {
- SAMRecord r = rec.getSAMRecord();
- System.out.println("Failed to add indel observation, probably out of coverage window bounds (trailing indel?):\nRead "+
- r.getReadName()+": "+
- "read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+
- r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+
- "; window end="+getStop());
- throw e;
- }
-
- if ( indelsAtSite == null ) {
- indelsAtSite = new ArrayList();
- indels.set(pos, indelsAtSite);
- }
-
- IndelVariant indel = null;
- for ( IndelVariant v : indelsAtSite ) {
- if ( ! v.equals(type, bases) ) continue;
-
- indel = v;
- indel.addObservation(rec);
- break;
- }
-
- if ( indel == null ) { // not found:
- indel = new IndelVariant(rec, type, bases);
- indelsAtSite.add(indel);
- }
- indel.addReadPositions(fromStart,fromEnd);
- }
-
- public List indelsAt( final long refPos ) {
- List l = indels.get((int)( refPos - start ));
- if ( l == null ) return emptyIndelList;
- else return l;
- }
-
-
- }
-
-
- class ExpandedSAMRecord {
- private SAMRecord read;
- private byte[] mismatch_flags;
- private byte[] expanded_quals;
- private int mms;
-
- public ExpandedSAMRecord(SAMRecord r, byte [] ref, long offset, IndelListener l) {
-
- read = r;
- final long rStart = read.getAlignmentStart();
- final long rStop = read.getAlignmentEnd();
- final byte[] readBases = read.getReadString().toUpperCase().getBytes();
-
- ref = new String(ref).toUpperCase().getBytes();
-
- mismatch_flags = new byte[(int)(rStop-rStart+1)];
- expanded_quals = new byte[(int)(rStop-rStart+1)];
-
- // now let's extract indels:
-
- Cigar c = read.getCigar();
- final int nCigarElems = c.numCigarElements();
-
-
- int readLength = 0; // length of the aligned part of the read NOT counting clipped bases
- for ( CigarElement cel : c.getCigarElements() ) {
-
- switch(cel.getOperator()) {
- case H:
- case S:
- case D:
- case N:
- case P:
- break; // do not count gaps or clipped bases
- case I:
- case M:
- case EQ:
- case X:
- readLength += cel.getLength();
- break; // advance along the gapless block in the alignment
- default :
- throw new IllegalArgumentException("Unexpected operator in cigar string: "+cel.getOperator());
- }
- }
-
- int fromStart = 0;
- int posOnRead = 0;
- int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read:
- // its start on the actual full reference contig is r.getAlignmentStart()
- for ( int i = 0 ; i < nCigarElems ; i++ ) {
-
- final CigarElement ce = c.getCigarElement(i);
- IndelVariant.Type type = null;
- String indel_bases = null;
- int eventPosition = posOnRef;
-
- switch(ce.getOperator()) {
- case H: break; // hard clipped reads do not have clipped indel_bases in their sequence, so we just ignore the H element...
- case I:
- type = IndelVariant.Type.I;
- indel_bases = read.getReadString().substring(posOnRead,posOnRead+ce.getLength());
- // will increment position on the read below, there's no 'break' statement yet...
- case S:
- // here we also skip soft-clipped indel_bases on the read; according to SAM format specification,
- // alignment start position on the reference points to where the actually aligned
- // (not clipped) indel_bases go, so we do not need to increment reference position here
- posOnRead += ce.getLength();
- break;
- case D:
- type = IndelVariant.Type.D;
- indel_bases = new String( ref, posOnRef, ce.getLength() );
- for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1;
-
- break;
- case M:
- case EQ:
- case X:
- for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) {
- if ( readBases[posOnRead] != ref[posOnRef] ) { // mismatch!
- mms++;
- mismatch_flags[posOnRef] = 1;
- }
- expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead];
- }
- fromStart += ce.getLength();
- break; // advance along the gapless block in the alignment
- default :
- throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator());
- }
-
- if ( type == null ) continue; // element was not an indel, go grab next element...
-
- // we got an indel if we are here...
- if ( i == 0 ) logger.debug("Indel at the start of the read "+read.getReadName());
- if ( i == nCigarElems - 1) logger.debug("Indel at the end of the read "+read.getReadName());
-
- // note that here we will be assigning indels to the first deleted base or to the first
- // base after insertion, not to the last base before the event!
- int fromEnd = readLength - fromStart;
- if ( type == IndelVariant.Type.I ) fromEnd -= ce.getLength();
-
- l.addObservation((int)(offset+eventPosition), type, indel_bases, fromStart, fromEnd, this);
-
- if ( type == IndelVariant.Type.I ) fromStart += ce.getLength();
-
- }
- }
-
- public SAMRecord getSAMRecord() { return read; }
-
- public byte [] getExpandedMMFlags() { return mismatch_flags; }
-
- public byte [] getExpandedQuals() { return expanded_quals; }
-
- public int getMMCount() { return mms; }
-
- public boolean equals(Object o) {
- if ( this == o ) return true;
- if ( read == null ) return false;
- if ( o instanceof SAMRecord ) return read.equals(o);
- if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read);
- return false;
- }
-
-
- }
-
-}
-
-
-class VCFIndelAttributes {
- public static String ALLELIC_DEPTH_KEY = VCFConstants.GENOTYPE_ALLELE_DEPTHS;
- public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY;
-
- public static String MAPQ_KEY = "MQS";
-
- public static String MM_KEY = "MM";
-
- public static String NQS_MMRATE_KEY = "NQSMM";
-
- public static String NQS_AVQ_KEY = "NQSBQ";
-
- public static String STRAND_COUNT_KEY = "SC";
- public static String RSTART_OFFSET_KEY = "RStart";
- public static String REND_OFFSET_KEY = "REnd";
-
- public static Set extends VCFHeaderLine> getAttributeHeaderLines() {
- Set lines = new HashSet();
-
- lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus reference/indel at the site"));
- lines.add(new VCFFormatHeaderLine(DEPTH_TOTAL_KEY, 1, VCFHeaderLineType.Integer, "Total coverage at the site"));
-
- lines.add(new VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of ref-/consensus indel-supporting reads"));
-
- lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, "Average # of mismatches per ref-/consensus indel-supporting read"));
-
- lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads"));
-
- lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases in ref-/consensus indel-supporting reads"));
-
- lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)"));
-
- lines.add(new VCFFormatHeaderLine(RSTART_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the starts of the reads"));
- lines.add(new VCFFormatHeaderLine(REND_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the ends of the reads"));
-
- return lines;
- }
-
- public static Map recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, Map attrs) {
- attrs.put(STRAND_COUNT_KEY, new Integer[] {cnt_cons_fwd, cnt_cons_rev, cnt_ref_fwd, cnt_ref_rev} );
- return attrs;
- }
-
- public static GenotypeBuilder recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev,
- GenotypeBuilder gb) {
- return gb.attribute(STRAND_COUNT_KEY, new Integer[] {cnt_ref_fwd, cnt_ref_rev,cnt_cons_fwd, cnt_cons_rev } );
- }
-
- public static Map recordDepth(int cnt_cons, int cnt_indel, int cnt_total, Map attrs) {
- attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_total-cnt_indel, cnt_cons} );
- attrs.put(DEPTH_TOTAL_KEY, cnt_total);
- return attrs;
- }
-
- public static GenotypeBuilder recordDepth(int cnt_cons, int cnt_indel, int cnt_total, GenotypeBuilder gb) {
- return gb.AD(new int[] {cnt_total-cnt_indel,cnt_cons} ).DP(cnt_total);
- }
-
- public static Map recordAvMapQ(double cons, double ref, Map attrs) {
- attrs.put(MAPQ_KEY, new Float[] {(float)ref, (float)cons} );
- return attrs;
- }
-
- public static GenotypeBuilder recordAvMapQ(double cons, double ref, GenotypeBuilder gb) {
- return gb.attribute(MAPQ_KEY,new float[] {(float)ref, (float)cons} );
- }
-
- public static Map recordAvMM(double cons, double ref, Map attrs) {
- attrs.put(MM_KEY, new Float[] {(float)ref, (float)cons} );
- return attrs;
- }
-
- public static GenotypeBuilder recordAvMM(double cons, double ref, GenotypeBuilder gb) {
- return gb.attribute(MM_KEY, new float[] {(float)ref, (float)cons} );
- }
-
- public static Map recordNQSMMRate(double cons, double ref, Map attrs) {
- attrs.put(NQS_MMRATE_KEY, new Float[] {(float)ref, (float)cons} );
- return attrs;
- }
-
- public static GenotypeBuilder recordNQSMMRate(double cons, double ref, GenotypeBuilder gb) {
- return gb.attribute(NQS_MMRATE_KEY, new float[] {(float)ref, (float)cons} );
- }
-
- public static Map recordNQSAvQ(double cons, double ref, Map attrs) {
- attrs.put(NQS_AVQ_KEY, new float[] {(float)ref, (float)cons} );
- return attrs;
- }
-
- public static GenotypeBuilder recordNQSAvQ(double cons, double ref, GenotypeBuilder gb) {
- return gb.attribute(NQS_AVQ_KEY, new float[] {(float)ref, (float)cons} );
- }
-
- public static Map recordOffsetFromStart(int median, int mad, Map attrs) {
- attrs.put(RSTART_OFFSET_KEY, new Integer[] {median, mad} );
- return attrs;
- }
-
- public static GenotypeBuilder recordOffsetFromStart(int median, int mad, GenotypeBuilder gb) {
- return gb.attribute(RSTART_OFFSET_KEY, new int[] {median, mad} );
- }
-
- public static Map recordOffsetFromEnd(int median, int mad, Map attrs) {
- attrs.put(REND_OFFSET_KEY, new Integer[] {median, mad} );
- return attrs;
- }
-
- public static GenotypeBuilder recordOffsetFromEnd(int median, int mad, GenotypeBuilder gb) {
- return gb.attribute(REND_OFFSET_KEY, new int[] {median, mad} );
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java
deleted file mode 100644
index 5a669e65c..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/collections/CircularArray.java
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.collections;
-
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-
-
-/** This class, closely resembling a deque (except that it is not dynamically grown),
- * provides an object with array-like interface and efficient
- * implementation of shift operation. Use this class when some kind of sliding window is required:
- * e.g. an array (window) is populated from some stream of data, and then the window is shifted.
- * If most of the data in the window remains the same so that only a few old elements sjould be popped from
- * and a few new elements pushed onto the array, both re-populating the whole array from the data and
- * shifting a regular array would be grossly inefficient. Instead, shiftData(int N) method of circular array
- * efficiently pops out N first elements and makes last N elements available.
- *
- * Consider an example of reading a character stream A,B,C,D,....,Z into an array with requirement of keeping
- * last 5 letters. First, we would read first 5 letters same way as we would with a regular array:
- *
- *
- * CircularArray a(5);
- * for ( int i = 0; i < 5; i++ ) a.set(i, readChar());
- *
- *
- * and then on the arrival of each next character we shift the array:
- *
- *
- * a.shiftData(1); a.set(4, readChar() );
- *
- *
- * After the lines from the above example are executed, the array will logically look as:
- *
- * B,C,D,E,F,
- *
- * e.g. as if we had a regular array, shifted it one element down and added new element on the top.
- *
- *
- * @author asivache
- *
- */
-public class CircularArray {
-
-
- private Object[] data ;
- private int offset;
-
- /** Creates an array of fixed length */
- public CircularArray(int length) {
- if ( length <= 0 ) throw new ReviewedStingException("CircularArray length must be positive. Passed: "+length);
- data = new Object[length];
- offset = 0;
- }
-
- /** Returns length of the array */
- public int length() {
- return data.length;
- }
-
- /** Gets i-th element of the array
- *
- * @throws IndexOutOfBoundsException if value of i is illegal
- */
- @SuppressWarnings("unchecked")
- public T get(int i) {
- if ( i < 0 || i >= data.length )
- throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; element requested: "+i);
- return (T)(data [ ( offset + i ) % data.length ]);
- }
-
- /** Sets i-th element of the array to the specified value.
- *
- * @throws IndexOutOfBoundsException if value of i is illegal
- */
- public void set(int i, T value) {
- if ( i < 0 || i >= data.length )
- throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; set element request at: "+i);
- data [ ( offset + i ) % data.length ] = value;
- }
-
- /** Set all elements to null.
- *
- */
- public void clear() {
- for ( int i = 0 ; i < data.length ; i++ ) data[i] = null;
- offset = 0;
- }
-
- /** Efficient shift-down of the array data. After this operation, array.get(0), array.get(1), etc will
- * be returning what array.get(shift), array.get(shift+1),... were returning before the shift was performed,
- * and last shift elements of the array will be reset to 0.
- * @param shift
- */
- public void shiftData(int shift) {
- if ( shift >= data.length ) {
- // if we shift by more than the length of stored data, we lose
- // all that data completely, so we just re-initialize the array.
- // This is not the operating mode CircularArray is intended for
- // but we can handle it, just in case.
- for ( int i = 0 ; i < data.length ; i++ ) data[i] = null;
- offset = 0;
- return;
- }
-
- // shift < data.length, so at least some data should be preserved
-
- final int newOffset = ( offset+shift ) % data.length;
- if ( newOffset < offset ) {
- // wrap-around!
- for ( int i = offset ; i < data.length ; i++ ) data[i] = null;
- for ( int i = 0; i < newOffset ; i++ ) data[i] = null;
- } else {
- for ( int i = offset ; i < newOffset ; i++ ) data[i] = null;
- }
- offset = newOffset;
- }
-
-
-
- /** Implements primitive int type-based circular array. See CircularArray for details.
- *
- * @author asivache
- *
- */
- public static class Int {
- private int [] data ;
- private int offset;
-
- /** Creates an array of fixed length */
- public Int(int length) {
- if ( length <= 0 ) throw new ReviewedStingException("CircularArray length must be positive. Passed: "+length);
- data = new int[length]; // automaticaly initialized to zeros
- offset = 0;
- }
-
- /** Returns length of the array */
- public int length() {
- return data.length;
- }
-
- /** Gets i-th element of the array
- *
- * @throws IndexOutOfBoundsException if value of i is illegal
- */
- public int get(int i) {
- if ( i < 0 || i >= data.length )
- throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; element requested: "+i);
- return data [ ( offset + i ) % data.length ];
- }
-
- /** Sets i-th element of the array to the specified value.
- *
- * @throws IndexOutOfBoundsException if value of i is illegal
- */
- public void set(int i, int value) {
- if ( i < 0 || i >= data.length )
- throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; set element request at: "+i);
- data [ ( offset + i ) % data.length ] = value;
- }
-
- /** Increments i-th element of the array by the specified value (value can be negative).
- *
- * @throws IndexOutOfBoundsException if i is illegal
- */
- public void increment(int i, int value) {
- if ( i < 0 || i >= data.length )
- throw new IndexOutOfBoundsException("Length of CircularArray: "+data.length+"; increment element request at: "+i);
- data [ ( offset + i ) % data.length ] += value;
- }
-
- /** Set all elements to 0.
- *
- */
- public void clear() {
- for ( int i = 0 ; i < data.length ; i++ ) data[i] = 0;
- offset = 0;
- }
-
- /** Efficient shift-down of the array data. After this operation, array.get(0), array.get(1), etc will
- * be returning what array.get(shift), array.get(shift+1),... were returning before the shift was performed,
- * and last shift elements of the array will be reset to 0.
- * @param shift
- */
- public void shiftData(int shift) {
- if ( shift >= data.length ) {
- // if we shift by more than the length of stored data, we lose
- // all that data completely, so we just re-initialize the array.
- // This is not the operating mode CircularArray is intended for
- // but we can handle it, just in case.
- for ( int i = 0 ; i < data.length ; i++ ) data[i] = 0;
- offset = 0;
- return;
- }
-
- // shift < data.length, so at least some data should be preserved
-
- final int newOffset = ( offset+shift ) % data.length;
- if ( newOffset < offset ) {
- // wrap-around!
- for ( int i = offset ; i < data.length ; i++ ) data[i] = 0;
- for ( int i = 0; i < newOffset ; i++ ) data[i] = 0;
- } else {
- for ( int i = offset ; i < newOffset ; i++ ) data[i] = 0;
- }
- offset = newOffset;
- }
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java
deleted file mode 100755
index 29ffb13e4..000000000
--- a/public/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.interval;
-
-import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
-import org.broadinstitute.sting.utils.GenomeLoc;
-
-import java.util.Iterator;
-
-/**
- * Created by IntelliJ IDEA.
- * User: asivache
- * Date: Oct 7, 2010
- * Time: 2:40:02 PM
- * To change this template use File | Settings | File Templates.
- */
-
-/** This class provides an adapter to Iterator that returns only (parts of) underlying iterator's
- * intervals overlapping with specified "master set" of bounding intervals. The underlying iterator must return
- * NON-overlapping intervals in coordinate-sorted order, otherwise the behavior is unspecified. If the master set is represented by
- * another interval iterator, it should return sorted and NON-overlapping intervals.
- *
- */
-public class OverlappingIntervalIterator implements Iterator {
- PushbackIterator iter = null;
- PushbackIterator boundBy = null;
-
- GenomeLoc prefetchedOverlap = null;
- GenomeLoc currentBound = null;
- GenomeLoc currentInterval = null;
-
-
- /** Creates new overlapping iterator that will internally traverse intervals and return only
- * overlaps of those with set of intervals returned by boundBy.
- * @param intervals
- * @param boundBy
- */
- public OverlappingIntervalIterator(Iterator intervals, Iterator boundBy) {
- this.iter = new PushbackIterator(intervals);
- this.boundBy = new PushbackIterator(boundBy);
-
- if ( iter.hasNext() && boundBy.hasNext() ) {
- currentInterval = iter.next(); // load first interval
- currentBound = boundBy.next(); // load first bounding interval
- fetchNextOverlap();
- }
- }
-
- /** Traverses both iterators in sync, until the first overlap between the two is reached. If no overlap is found
- * until the end of the either of the two streams, leaves prefetchedOverlap set to null
- */
- private void fetchNextOverlap() {
-
- prefetchedOverlap = null;
- // System.out.println("Fetching... (interval="+currentInterval+"; bound="+currentBound+")");
- while ( currentInterval != null && currentBound != null ) {
-
- if ( currentInterval.isBefore(currentBound) ) {
-// System.out.println(currentInterval +" is before "+currentBound );
- if ( ! iter.hasNext() ) currentInterval = null;
- else currentInterval = iter.next();
- continue;
- }
-
- if ( currentInterval.isPast(currentBound) ) {
-// System.out.println(currentInterval +" is past "+currentBound );
- if ( ! boundBy.hasNext() ) currentBound = null;
- else currentBound = boundBy.next();
- continue;
- }
-
- // we are at this point only if currentInterval overlaps with currentBound
-
- prefetchedOverlap = currentInterval.intersect(currentBound);
-// System.out.println("Fetched next overlap: "+prefetchedOverlap);
- // now we need to advance at least one of the iterators, so that we would not
- // call the same overlap again
-
- // however we still do not know if we are done with either current interval or current bound, because
- // two special situations are possible:
- //
- // 1) next interval overlaps with 2) current interval also overlaps with
- // the same bounding interval; next bounding interval; note that
- // note that in this case next in this case next bound necessarily
- // interval necessarily starts before starts before the next interval
- // the next bound
- //
- // curr. int next int. curr. int
- // ----- ------ --------------------------
- // ------------------- --------- -------------
- // curr. bound curr. bound next bound
-
- // To solve this issue we update either only currentInterval or only currentBound to their next value,
- // whichever of those next values (intervals) comes first on the reference genome;
- // the rest of the traversal to the next overlap will be performed on the next invocation of
- // fetchNextOverlap().
-
- advanceToNearest();
-
- break; // now that we computed the overlap and advanced (at least one of) the intervals/bounds to
- // the next location, we are done - bail out from the loop.
- }
-
- }
-
- private void advanceToNearest() {
- if ( ! iter.hasNext() ) {
- currentBound = boundBy.hasNext() ? boundBy.next() : null;
- } else {
- if ( ! boundBy.hasNext() ) currentInterval = iter.hasNext() ? iter.next() : null;
- else {
- // both intervals and bounds have next value available; let's check which comes first:
- GenomeLoc nextInterval = iter.next();
- GenomeLoc nextBound = boundBy.next();
-
- if ( nextInterval.compareTo(nextBound) < 0 ) {
- currentInterval = nextInterval;
- boundBy.pushback(nextBound);
- } else {
- currentBound = nextBound;
- iter.pushback(nextInterval);
- }
-
- }
- }
- }
-
- /**
- * Returns true if the iteration has more elements. (In other
- * words, returns true if next would return an element
- * rather than throwing an exception.)
- *
- * @return true if the iterator has more elements.
- */
- public boolean hasNext() {
- return prefetchedOverlap != null;
- }
-
- /**
- * Returns the next element in the iteration.
- *
- * @return the next element in the iteration.
- * @throws java.util.NoSuchElementException
- * iteration has no more elements.
- */
- public GenomeLoc next() {
- if ( prefetchedOverlap == null )
- throw new java.util.NoSuchElementException("Illegal call to next(): Overlapping iterator has no more overlaps");
- GenomeLoc ret = prefetchedOverlap; // cache current prefetched overlap
- fetchNextOverlap(); // prefetch next overlap
- return ret ;
- }
-
- /**
- * Removes from the underlying collection the last element returned by the
- * iterator (optional operation). This method can be called only once per
- * call to next . The behavior of an iterator is unspecified if
- * the underlying collection is modified while the iteration is in
- * progress in any way other than by calling this method.
- *
- * @throws UnsupportedOperationException if the remove
- * operation is not supported by this Iterator.
- * @throws IllegalStateException if the next method has not
- * yet been called, or the remove method has already
- * been called after the last call to the next
- * method.
- */
- public void remove() {
- throw new UnsupportedOperationException("remove() method is not supported by OverlappingIntervalIterator");
- //To change body of implemented methods use File | Settings | File Templates.
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
index 7ef05edd8..5ad4f5c8b 100644
--- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
@@ -236,18 +236,6 @@ public class AlignmentUtils {
return n;
}
- public static int getNumAlignedBases(final SAMRecord r) {
- int n = 0;
- final Cigar cigar = r.getCigar();
- if (cigar == null) return 0;
-
- for (final CigarElement e : cigar.getCigarElements())
- if (e.getOperator() == CigarOperator.M)
- n += e.getLength();
-
- return n;
- }
-
public static int getNumAlignedBasesCountingSoftClips(final SAMRecord r) {
int n = 0;
final Cigar cigar = r.getCigar();
@@ -512,47 +500,6 @@ public class AlignmentUtils {
return true;
}
- /**
- * Returns true is read is mapped and mapped uniquely (Q>0).
- *
- * @param read
- * @return
- */
- public static boolean isReadUniquelyMapped(SAMRecord read) {
- return (!AlignmentUtils.isReadUnmapped(read)) && read.getMappingQuality() > 0;
- }
-
- /**
- * Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
- * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
- * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
- * of read's base qualitites is inverted (in this case new array is allocated and returned).
- *
- * @param read
- * @return
- */
- public static byte[] getQualsInCycleOrder(SAMRecord read) {
- if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getBaseQualities();
-
- return Utils.reverse(read.getBaseQualities());
- }
-
- /**
- * Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from
- * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
- * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
- * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities
- * are available this method will throw a runtime exception.
- *
- * @param read
- * @return
- */
- public static byte[] getOriginalQualsInCycleOrder(SAMRecord read) {
- if (isReadUnmapped(read) || !read.getReadNegativeStrandFlag()) return read.getOriginalBaseQualities();
-
- return Utils.reverse(read.getOriginalBaseQualities());
- }
-
/**
* Takes the alignment of the read sequence readSeq to the reference sequence refSeq
* starting at 0-based position refIndex on the refSeq and specified by its cigar.