diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index cfbce58ee..43ea46002 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; import java.util.List; @@ -37,7 +38,7 @@ public class WindowMaker implements Iterable, I /** * The data source for reads. Will probably come directly from the BAM file. */ - private final Iterator sourceIterator; + private final PeekableIterator sourceIterator; /** * Stores the sequence of intervals that the windowmaker should be tracking. @@ -69,7 +70,7 @@ public class WindowMaker implements Iterable, I this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - this.sourceIterator = new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData); + this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData)); this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } @@ -100,11 +101,6 @@ public class WindowMaker implements Iterable, I */ private final GenomeLoc locus; - /** - * Signal not to advance the iterator because we're currently sitting at the next element. 
- */ - private boolean atNextElement = false; - public WindowMakerIterator(GenomeLoc locus) { this.locus = locus; advance(); @@ -124,58 +120,45 @@ public class WindowMaker implements Iterable, I public boolean hasNext() { advance(); - return atNextElement; + return currentAlignmentContext != null; } public AlignmentContext next() { - advance(); - if(!atNextElement) throw new NoSuchElementException("WindowMakerIterator is out of elements for this interval."); + if(!hasNext()) throw new NoSuchElementException("WindowMakerIterator is out of elements for this interval."); - // Prepare object state for no next element. + // Consume this alignment context. AlignmentContext toReturn = currentAlignmentContext; currentAlignmentContext = null; - atNextElement = false; // Return the current element. return toReturn; } private void advance() { - // No shard boundaries specified. If currentAlignmentContext has been consumed, grab the next one. - if(locus == null) { - if(!atNextElement && sourceIterator.hasNext()) { - currentAlignmentContext = sourceIterator.next(); - atNextElement = true; - } - return; - } - - // Can't possibly find another element. Skip out early. - if(currentAlignmentContext == null && !sourceIterator.hasNext()) - return; - // Need to find the next element that is not past shard boundaries. If we travel past the edge of // shard boundaries, stop and let the next interval pick it up. - while(sourceIterator.hasNext()) { - // Seed the current alignment context first time through the loop. - if(currentAlignmentContext == null) - currentAlignmentContext = sourceIterator.next(); + while(currentAlignmentContext == null && sourceIterator.hasNext()) { + // Advance the iterator and try again. + AlignmentContext candidateAlignmentContext = sourceIterator.peek(); - // Found a match. - if(locus.containsP(currentAlignmentContext.getLocation())) { - atNextElement = true; + if(locus == null) { + // No filter present. Return everything that LocusIteratorByState provides us. 
+ currentAlignmentContext = sourceIterator.next(); + } + else if(locus.isPast(candidateAlignmentContext.getLocation())) + // Found a locus before the current window; claim this alignment context and throw it away. + sourceIterator.next(); + else if(locus.containsP(candidateAlignmentContext.getLocation())) { + // Found a locus within the current window; claim this alignment context and call it the next entry. + currentAlignmentContext = sourceIterator.next(); + } + else if(locus.isBefore(candidateAlignmentContext.getLocation())) { + // Whoops. Skipped passed the end of the region. Iteration for this window is complete. Do + // not claim this alignment context in case it is part of the next shard. break; } - // Whoops. Skipped passed the end of the region. Iteration for this window is complete. - if(locus.isBefore(currentAlignmentContext.getLocation())) - break; - - // No more elements to examine. Iteration is complete. - if(!sourceIterator.hasNext()) - break; - - // Advance the iterator and try again. 
- currentAlignmentContext = sourceIterator.next(); + else + throw new ReviewedStingException("BUG: filtering locus does not contain, is not before, and is not past the given alignment context"); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java deleted file mode 100644 index 9e5a95d10..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java +++ /dev/null @@ -1,130 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata.indexer; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.Tribble; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; - -import java.io.File; -import java.io.FileOutputStream; - -/** - * a utility class that can create an index, written to a target location. This is useful when you're unable to write to the directory - * in which an index is located, or if you'd like to pre-index files to save time. 
- */ -public class RMDIndexer extends CommandLineProgram { - @Argument(shortName="in", fullName="inputFile", doc="The reference meta data file to index", required = true) - File inputFileSource = null; - - @Argument(shortName="t", fullName="type", doc="The reference meta data file format (e.g. vcf, bed)", required = true) - String inputFileType = null; - - @Input(fullName = "referenceSequence", shortName = "R", doc = "The reference to use when indexing; this sequence will be set in the index", required = true) - public File referenceFile = null; - - @Input(shortName = "i", fullName = "indexFile", doc = "Where to write the index to (as a file), if not supplied we write to .idx", required = false) - public File indexFile = null; - - @Argument(shortName = "ba", fullName = "balanceApproach", doc="the index balancing approach to take", required=false) - IndexFactory.IndexBalanceApproach approach = IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME; - - private static Logger logger = Logger.getLogger(RMDIndexer.class); - private IndexedFastaSequenceFile ref = null; - private GenomeLocParser genomeLocParser = null; - - @Override - protected int execute() throws Exception { - - - // check parameters - // --------------------------------------------------------------------------------- - - // check the input parameters - if (referenceFile != null && !referenceFile.canRead()) - throw new IllegalArgumentException("We can't read the reference file: " - + referenceFile + ", check that it exists, and that you have permissions to read it"); - - - // set the index file to the default name if they didn't specify a file - if (indexFile == null && inputFileSource != null) - indexFile = new File(inputFileSource.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION); - - // check that we can create the output file - if (indexFile == null || indexFile.exists()) - throw new IllegalArgumentException("We can't write to the index file location: " - + indexFile + ", the index exists"); - - 
logger.info(String.format("attempting to index file: %s", inputFileSource)); - logger.info(String.format("using reference: %s", ((referenceFile != null) ? referenceFile.getAbsolutePath() : "(not supplied)"))); - logger.info(String.format("using type: %s", inputFileType)); - logger.info(String.format("writing to location: %s", indexFile.getAbsolutePath())); - - // try to index the file - // --------------------------------------------------------------------------------- - - // setup the reference - ref = new CachingIndexedFastaSequenceFile(referenceFile); - genomeLocParser = new GenomeLocParser(ref); - - // get a track builder - RMDTrackBuilder builder = new RMDTrackBuilder(ref.getSequenceDictionary(),genomeLocParser, ValidationExclusion.TYPE.ALL); - - // find the types available to the track builders - FeatureManager.FeatureDescriptor descriptor = builder.getFeatureManager().getByName(inputFileType); - - // check that the type is valid - if (descriptor == null) - throw new IllegalArgumentException("The type specified " + inputFileType + " is not a valid type. 
Valid type list: " + builder.getFeatureManager().userFriendlyListOfAvailableFeatures()); - - // create the codec - FeatureCodec codec = builder.getFeatureManager().createCodec(descriptor, "foo", genomeLocParser); - - // check if it's a reference dependent feature codec - if (codec instanceof ReferenceDependentFeatureCodec) - ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser); - - // get some timing info - long currentTime = System.currentTimeMillis(); - - Index index = IndexFactory.createIndex(inputFileSource, codec, approach); - - // add writing of the sequence dictionary, if supplied - builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary()); - - // create the output stream, and write the index - LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - index.write(stream); - stream.close(); - - // report and exit - logger.info("Successfully wrote the index to location: " + indexFile + " in " + ((System.currentTimeMillis() - currentTime)/1000) + " seconds"); - return 0; // return successfully - } - - - /** - * the generic call execute main - * @param argv the arguments from the command line - */ - public static void main(String[] argv) { - try { - RMDIndexer instance = new RMDIndexer(); - start(instance, argv); - System.exit(CommandLineProgram.result); - } catch (Exception e) { - exitSystemWithError(e); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index ff409484d..ee08cfa3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -42,7 +42,7 @@ import java.util.List; import java.util.Map; /** - * List all of the samples in the info field + * List all of the polymorphic samples. 
*/ public class SampleList extends InfoFieldAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 6d917325e..58b33924b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -81,7 +81,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,UAC.OUTPUT_DEBUG_INDEL_INFO); + pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.BANDED_INDEL_COMPUTATION); alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; @@ -100,10 +101,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood GenomeLoc loc = ref.getLocus(); ArrayList aList = new ArrayList(); - if (DEBUG) { - System.out.println("'''''''''''''''''''''"); - System.out.println("Loc:"+loc.toString()); - } HashMap consensusIndelStrings = new HashMap(); int insCount = 0, delCount = 0; @@ -137,12 +134,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood continue; } - if (DEBUG && p.isIndel()) { +/* if (DEBUG && p.isIndel()) { System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), 
p.getEventLength(),p.getType().toString(), p.getEventBases()); } - + */ String indelString = p.getEventBases(); if (p.isInsertion()) { @@ -212,7 +209,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } - if (DEBUG) { +/* if (DEBUG) { int icount = indelPileup.getNumberOfInsertions(); int dcount = indelPileup.getNumberOfDeletions(); if (icount + dcount > 0) @@ -226,7 +223,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } System.out.println(); } - } + } */ } int maxAlleleCnt = 0; @@ -237,8 +234,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood maxAlleleCnt = curCnt; bestAltAllele = s; } - if (DEBUG) - System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); +// if (DEBUG) +// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); } //gdebug- if (maxAlleleCnt < minIndelCountForGenotyping) @@ -366,10 +363,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1; final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1; - if (DEBUG) - System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength, - (int)ref.getWindow().size(), loc.getStart(), numPrefBases); - //System.out.println(eventLength); haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), ref, hsize, numPrefBases); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index f5a107ee7..f437db680 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -153,6 +153,9 @@ public class 
UnifiedArgumentCollection { // @Hidden // @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE") // public File INDEL_RECAL_FILE = new File("indel.recal_data.csv"); + @Hidden + @Argument(fullName = "bandedIndel", shortName = "bandedIndel", doc = "Banded Indel likelihood computation", required = false) + public boolean BANDED_INDEL_COMPUTATION = false; @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) @@ -199,7 +202,7 @@ public class UnifiedArgumentCollection { // todo- arguments to remove uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT; uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - + uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 68cbd4fb7..c8328771b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -32,6 +32,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -39,16 +40,12 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.io.File; +import java.io.FileWriter; +import java.io.PrintStream; import java.util.Arrays; import 
java.util.HashMap; import java.util.LinkedHashMap; -/*import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection; -*/ - public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; @@ -57,6 +54,7 @@ public class PairHMMIndelErrorModel { private final double logGapContinuationProbability; private boolean DEBUG = false; + private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -95,11 +93,11 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb) { + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) { this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob this.DEBUG = deb; - + this.bandedLikelihoods = bandedLikelihoods; // fill gap penalty table, affine naive model: this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; @@ -164,8 +162,35 @@ public class PairHMMIndelErrorModel { } + private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, + double[] currentGOP, double[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { + if (indI > 0 && indJ > 0) { + final int im1 = indI -1; + final int jm1 = indJ - 1; + // update current point + final byte x = readBases[im1]; + final byte y = haplotypeBases[jm1]; + final byte qual = readQuals[im1] < 1 ? 1 : (readQuals[im1] > MAX_CACHED_QUAL ? 
MAX_CACHED_QUAL : readQuals[im1]); + + final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; + + matchMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead, + YMetricArray[im1][jm1] + pBaseRead); + + final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; + final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + + XMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); + + // update Y array + final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; + final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + YMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); + } + } + private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, - double[] currentGOP, double[] currentGCP) { + double[] currentGOP, double[] currentGCP, int eventLength) { final int X_METRIC_LENGTH = readBases.length+1; final int Y_METRIC_LENGTH = haplotypeBases.length+1; @@ -175,62 +200,153 @@ public class PairHMMIndelErrorModel { final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; + + final double DIAG_TOL = 20; // means that max - min element in diags have to be > this number for banding to take effect. 
+ // default initialization for all arrays + for (int i=0; i < X_METRIC_LENGTH; i++) { + Arrays.fill(matchMetricArray[i],Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[i],Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[i],Double.NEGATIVE_INFINITY); + } for (int i=1; i < X_METRIC_LENGTH; i++) { //initialize first column - matchMetricArray[i][0] = Double.NEGATIVE_INFINITY; - YMetricArray[i][0] = Double.NEGATIVE_INFINITY; - XMetricArray[i][0] = END_GAP_COST*(i);//logGapOpenProbability + (i-1)*logGapContinuationProbability; + XMetricArray[i][0] = END_GAP_COST*(i); } for (int j=1; j < Y_METRIC_LENGTH; j++) { // initialize first row - matchMetricArray[0][j] = Double.NEGATIVE_INFINITY; - XMetricArray[0][j] = Double.NEGATIVE_INFINITY; - YMetricArray[0][j] = END_GAP_COST*(j);//logGapOpenProbability + (j-1) * logGapContinuationProbability; + YMetricArray[0][j] = END_GAP_COST*(j); } + matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; + XMetricArray[0][0]= YMetricArray[0][0] = 0; - for (int indI=1; indI < X_METRIC_LENGTH; indI++) { - final int im1 = indI-1; - for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) { - final int jm1 = indJ-1; - final byte x = readBases[im1]; - final byte y = haplotypeBases[jm1]; - final byte qual = readQuals[im1] < 1 ? 1 : (readQuals[im1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[im1]); - final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - double bestMetric = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, - XMetricArray[im1][jm1] + pBaseRead, - YMetricArray[im1][jm1] + pBaseRead); - matchMetricArray[indI][indJ] = bestMetric; + final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1; + final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH); - // update X array - // State X(i,j): X(1:i) aligned to a gap in Y(1:j). 
- // When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps + int idxWithMaxElement = 0; - final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; - bestMetric = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); - XMetricArray[indI][indJ] = bestMetric; + double maxElementInDiag = Double.NEGATIVE_INFINITY; - // update Y array - //c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]); - //d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]); - final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; - bestMetric = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); - YMetricArray[indI][indJ] = bestMetric; + for (int diag=0; diag < numDiags; diag++) { + // compute default I and J start positions at edge of diagonals + int indI = 0; + int indJ = diag; + if (diag >= Y_METRIC_LENGTH ) { + indI = diag-(Y_METRIC_LENGTH-1); + indJ = Y_METRIC_LENGTH-1; } + + // first pass: from max element to edge + int idxLow = bandedLikelihoods? 
idxWithMaxElement : 0; + + // reset diag max value before starting + if (bandedLikelihoods) { + maxElementInDiag = Double.NEGATIVE_INFINITY; + // set indI, indJ to correct values + indI += idxLow; + indJ -= idxLow; + if (indI >= X_METRIC_LENGTH || indJ < 0) { + idxLow--; + indI--; + indJ++; + } + + } + + for (int el = idxLow; el < elemsInDiag; el++) { + updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, + currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); + // update max in diagonal + if (bandedLikelihoods) { + final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); + + // check if we've fallen off diagonal value by threshold + if (bestMetric > maxElementInDiag) { + maxElementInDiag = bestMetric; + idxWithMaxElement = el; + } + else if (bestMetric < maxElementInDiag - DIAG_TOL) + break; // done w/current diagonal + } + + indI++; + if (indI >=X_METRIC_LENGTH ) + break; + indJ--; + if (indJ <= 0) + break; + } + if (bandedLikelihoods && idxLow > 0) { + // now do second part in opposite direction + indI = 0; + indJ = diag; + if (diag >= Y_METRIC_LENGTH ) { + indI = diag-(Y_METRIC_LENGTH-1); + indJ = Y_METRIC_LENGTH-1; + } + + indI += idxLow-1; + indJ -= idxLow-1; + for (int el = idxLow-1; el >= 0; el--) { + + updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, + currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); + // update max in diagonal + final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); + + // check if we've fallen off diagonal value by threshold + if (bestMetric > maxElementInDiag) { + maxElementInDiag = bestMetric; + idxWithMaxElement = el; + } + else if (bestMetric < maxElementInDiag - DIAG_TOL) + break; // done w/current diagonal + + indJ++; + if (indJ >= Y_METRIC_LENGTH ) + break; + indI--; + if 
(indI <= 0) + break; + } + } + // if (DEBUG) + // System.out.format("Max:%4.1f el:%d\n",maxElementInDiag, idxWithMaxElement); } final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1; final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ], - XMetricArray[bestI][bestJ], - YMetricArray[bestI][bestJ]); - - if (DEBUG) - System.out.format("Likelihood: %5.4f\n", bestMetric); + XMetricArray[bestI][bestJ], + YMetricArray[bestI][bestJ]); + /* + if (DEBUG) { + PrintStream outx, outy, outm, outs; + double[][] sumMetrics = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + try { + outx = new PrintStream("../../UGOptim/datax.txt"); + outy = new PrintStream("../../UGOptim/datay.txt"); + outm = new PrintStream("../../UGOptim/datam.txt"); + outs = new PrintStream("../../UGOptim/datas.txt"); + double metrics[] = new double[3]; + for (int indI=0; indI < X_METRIC_LENGTH; indI++) { + for (int indJ=0; indJ < Y_METRIC_LENGTH; indJ++) { + metrics[0] = matchMetricArray[indI][indJ]; + metrics[1] = XMetricArray[indI][indJ]; + metrics[2] = YMetricArray[indI][indJ]; + //sumMetrics[indI][indJ] = MathUtils.softMax(metrics); + outx.format("%4.1f ", metrics[1]); + outy.format("%4.1f ", metrics[2]); + outm.format("%4.1f ", metrics[0]); + outs.format("%4.1f ", MathUtils.softMax(metrics)); + } + outx.println(); outm.println();outy.println(); outs.println(); + } + outm.close(); outx.close(); outy.close(); + } catch (java.io.IOException e) { throw new UserException("bla");} + } + */ return bestMetric; @@ -262,11 +378,6 @@ public class PairHMMIndelErrorModel { LinkedHashMap gapOpenProbabilityMap = new LinkedHashMap(); LinkedHashMap gapContProbabilityMap = new LinkedHashMap(); - if (DEBUG) { - System.out.println("Reference bases:"); - System.out.println(new String(ref.getBases())); - } - // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. 
// todo -- refactor into separate function for (Allele a: haplotypeMap.keySet()) { @@ -278,13 +389,6 @@ public class PairHMMIndelErrorModel { // get homopolymer length profile for current haplotype int[] hrunProfile = new int[haplotypeBases.length]; getContextHomopolymerLength(haplotypeBases,hrunProfile); - if (DEBUG) { - System.out.println("Haplotype bases:"); - System.out.println(new String(haplotypeBases)); - for (int i=0; i < hrunProfile.length; i++) - System.out.format("%d",hrunProfile[i]); - System.out.println(); - } fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); @@ -414,18 +518,16 @@ public class PairHMMIndelErrorModel { // ok, we now figured out total number of clipped bases on both ends. // Figure out where we want to place the haplotype to score read against - if (DEBUG) - System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); - + /* + if (DEBUG) + System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); + */ LinkedHashMap readEl = new LinkedHashMap(); if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) { - if (DEBUG) - System.out.println("BAD READ!!"); - int j=0; for (Allele a: haplotypeMap.keySet()) { readEl.put(a,0.0); @@ -440,13 +542,6 @@ public class PairHMMIndelErrorModel { byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases, unclippedReadBases.length-numEndClippedBases); - double[] recalCDP = null; - - if (DEBUG) { - System.out.println("Read bases:"); - 
System.out.println(new String(readBases)); - } - int j=0; for (Allele a: haplotypeMap.keySet()) { @@ -465,14 +560,10 @@ public class PairHMMIndelErrorModel { byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(), (int)indStart, (int)indStop); - if (DEBUG) { - System.out.println("Haplotype to test:"); - System.out.println(new String(haplotypeBases)); - } - final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); - final double readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); + final double readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, + currentContextGOP, currentContextGCP, eventLength); readEl.put(a,readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 945d02837..5d07922a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -159,10 +159,11 @@ public class CycleCovariate implements StandardCovariate { */ // todo -- this should be put into a common place in the code base - private static List PACBIO_NAMES = Arrays.asList("PACBIO"); private static List ILLUMINA_NAMES = Arrays.asList("ILLUMINA", "SLX", "SOLEXA"); private static List SOLID_NAMES = Arrays.asList("SOLID"); private static List LS454_NAMES = Arrays.asList("454"); + private static List COMPLETE_GENOMICS_NAMES = Arrays.asList("COMPLETE"); + private static List PACBIO_NAMES = Arrays.asList("PACBIO"); private static boolean 
isPlatform(SAMRecord read, List names) { String pl = read.getReadGroup().getPlatform().toUpperCase(); @@ -176,11 +177,10 @@ public class CycleCovariate implements StandardCovariate { public void getValues(SAMRecord read, Comparable[] comparable) { //----------------------------- - // ILLUMINA and SOLID + // Illumina, Solid, PacBio, and Complete Genomics //----------------------------- - - if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES)) { + if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES) || isPlatform(read, COMPLETE_GENOMICS_NAMES) ) { final int init; final int increment; if( !read.getReadNegativeStrandFlag() ) { @@ -222,6 +222,11 @@ public class CycleCovariate implements StandardCovariate { cycle += increment; } } + + //----------------------------- + // 454 + //----------------------------- + else if ( isPlatform(read, LS454_NAMES) ) { // Some bams have "LS454" and others have just "454" final int readLength = read.getReadLength(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index ac25d4f13..2daa8c025 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -245,8 +245,7 @@ public class RecalDataManager { readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); ((GATKSAMRecord)read).setReadGroup( readGroup ); } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() + - " Users must set both the default read group using the --default_read_group argument and the default platform using the --default_platform argument." 
); + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() ); } } @@ -271,8 +270,7 @@ public class RecalDataManager { } readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() + - " Users must set the default platform using the --default_platform argument." ); + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index f31e2fc5b..75de84cb4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; /** * Created by IntelliJ IDEA. @@ -41,22 +42,29 @@ public class RecalibrationArgumentCollection { ////////////////////////////////// // Shared Command Line Arguments ////////////////////////////////// + @Hidden @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") public String DEFAULT_READ_GROUP = null; + @Hidden @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. 
Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; + @Hidden @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") public String FORCE_READ_GROUP = null; + @Hidden @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + @Hidden @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) public int WINDOW_SIZE = 5; /** * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. */ + @Hidden @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) public int HOMOPOLYMER_NBACK = 7; + @Hidden @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. 
The default behavior is to use tile = -1", required=false) public boolean EXCEPTION_IF_NO_TILE = false; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java new file mode 100644 index 000000000..7aa9fdd65 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.Comparator; + +/** + * + * @author Mauricio Carneiro + * @since 9/28/11 + */ +public class GenomeLocComparator implements Comparator { + /** + * compares genomeLoc's contigs + * + * @param gl1 the genome loc to compare contigs + * @param gl2 the genome loc to compare contigs + * @return 0 if equal, -1 if gl2.contig is greater, 1 if gl1.contig is greater + */ + @Requires("gl2 != null") + @Ensures("result == 0 || result == 1 || result == -1") + public final int compareContigs( GenomeLoc gl1, GenomeLoc gl2 ) { + if (gl1.contigIndex == gl2.contigIndex) + return 0; + else if (gl1.contigIndex > gl2.contigIndex) + return 1; + return -1; + } + + @Requires("gl2 != null") + @Ensures("result == 0 || result == 1 || result == -1") + public int compare ( GenomeLoc gl1, GenomeLoc gl2 ) { + int result = 0; + + if ( gl1 == gl2 ) { + result = 0; + } + else if(GenomeLoc.isUnmapped(gl1)) + result = 1; + else if(GenomeLoc.isUnmapped(gl2)) + result = -1; + else { + final int cmpContig = compareContigs(gl1, gl2); + + if ( cmpContig != 0 ) { + result = cmpContig; + } else { + if ( gl1.getStart() < gl2.getStart() ) result = -1; + if ( gl1.getStart() > gl2.getStart() ) result = 1; + } + } + + return result; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 83db20238..5230381c0 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -68,7 +68,7 @@ public class ReadClipper { return result; } - private SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { + protected SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); @@ -93,8 +93,9 @@ public class ReadClipper { public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { if (left == right) return new SAMRecord(read.getHeader()); - this.read = hardClipByReferenceCoordinates(right, -1); - return hardClipByReferenceCoordinates(-1, left); + SAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1); + ReadClipper clipper = new ReadClipper(leftTailRead); + return clipper.hardClipByReferenceCoordinatesLeftTail(left); } public SAMRecord hardClipLowQualEnds(byte lowQual) { diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java index 7ea77a939..20f9eccd3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -325,11 +325,14 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { return Arrays.toString((Object[])value); else throw new RuntimeException("Unexpected array type in prettyPrintValue. 
Value was " + value + " type is " + type); - } else if ( RodBinding.class.isAssignableFrom(value.getClass() ) ) + } else if ( RodBinding.class.isAssignableFrom(value.getClass() ) ) { // annoying special case to handle the UnBound() constructor return "none"; - - return value.toString(); + } else if ( value instanceof String ) { + return value.equals("") ? "\"\"" : value; + } else { + return value.toString(); + } } /** @@ -432,11 +435,17 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { * Returns a Pair of (main, synonym) names for argument with fullName s1 and * shortName s2. The main is selected to be the longest of the two, provided * it doesn't exceed MAX_DISPLAY_NAME, in which case the shorter is taken. - * @param s1 - * @param s2 - * @return + * + * @param s1 the short argument name without -, or null if not provided + * @param s2 the long argument name without --, or null if not provided + * @return A pair of fully qualified names (with - or --) for the argument. The first + * element is the primary display name while the second (potentially null) is a + * synonymous name. */ Pair displayNames(String s1, String s2) { + s1 = s1 == null ? null : "-" + s1; + s2 = s2 == null ? 
null : "--" + s2; + if ( s1 == null ) return new Pair(s2, null); if ( s2 == null ) return new Pair(s1, null); @@ -510,7 +519,7 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { */ protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { Map root = new HashMap(); - Pair names = displayNames("-" + def.shortName, "--" + def.fullName); + Pair names = displayNames(def.shortName, def.fullName); root.put("name", names.getFirst() ); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 5fa410922..e0a3a5a53 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -825,7 +825,7 @@ public class ReadUtils { * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) */ @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) - @Ensures({"result >= 0", "result < read.getReadLength()"}) + @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) public static Pair getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; @@ -893,6 +893,10 @@ public class ReadUtils { // base before the deletion (see warning in function contracts) else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1; + + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + else if (fallsInsideDeletion && endsWithinCigar) + readBases--; } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index acfefd627..e5cf80826 100755 --- 
a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -118,6 +118,20 @@ public abstract class LocusViewTemplate extends BaseTest { testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); } + @Test + public void readAndLocusOverlapAtLastBase() { + SAMRecord read = buildSAMRecord("chr1", 1, 5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 5, 5))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + @Test public void readOverlappingStartTest() { SAMRecord read = buildSAMRecord("chr1", 1, 10); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java new file mode 100644 index 000000000..1415379db --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons 
to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.clipreads; + +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.LinkedList; +import java.util.List; + +/** + * Unit tests for ReadClipper: hard clipping by read coordinates, by reference + * coordinates (both tails), and by low base quality. + * + * @author roger + * @since 9/28/11
+ */ +public class ReadClipperUnitTest extends BaseTest { + + // TODO: Add error messages on failed tests + + SAMRecord read, expected; + ReadClipper readClipper; + final static String BASES = "ACTG"; + final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 + + @BeforeClass + public void init() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); + read.setReadUnmappedFlag(true); + read.setReadBases(new String(BASES).getBytes()); + read.setBaseQualityString(new String(QUALS)); + + readClipper = new ReadClipper(read); + } + + @Test + public void testHardClipBothEndsByReferenceCoordinates() { + logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new SAMRecord(read.getHeader())); + //clip 1 base + expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3)); + Assert.assertEquals(expected.getCigarString(), "1H2M1H"); + + } + + @Test + public void testHardClipByReadCoordinates() { + logger.warn("Executing testHardClipByReadCoordinates"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new SAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReadCoordinates(0,0); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 1 base at end + expected = readClipper.hardClipByReadCoordinates(3,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + 
Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at start + expected = readClipper.hardClipByReadCoordinates(0,1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + //clip 2 bases at end + expected = readClipper.hardClipByReadCoordinates(2,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test + public void testHardClipByReferenceCoordinates() { + logger.warn("Executing testHardClipByReferenceCoordinates"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new SAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReferenceCoordinates(-1,1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 1 base at end + expected = readClipper.hardClipByReferenceCoordinates(3,-1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at start + expected = readClipper.hardClipByReferenceCoordinates(-1,2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + //clip 2 bases at end + expected = 
readClipper.hardClipByReferenceCoordinates(2,-1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test + public void testHardClipByReferenceCoordinatesLeftTail() { + logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new SAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 2 bases at start + expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + } + + @Test + public void testHardClipByReferenceCoordinatesRightTail() { + logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new SAMRecord(read.getHeader())); + + //clip 1 base at end + expected = readClipper.hardClipByReferenceCoordinatesRightTail(3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at end + expected = readClipper.hardClipByReferenceCoordinatesRightTail(2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), 
QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test + public void testHardClipLowQualEnds() { + logger.warn("Executing testHardClipLowQualEnds"); + + + //Clip whole read + Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new SAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipLowQualEnds((byte)34); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 2 bases at start + expected = readClipper.hardClipLowQualEnds((byte)44); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + // Reverse Quals sequence + readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 + + //clip 1 base at end + expected = readClipper.hardClipLowQualEnds((byte)34); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at end + expected = readClipper.hardClipLowQualEnds((byte)44); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + // revert Qual sequence + readClipper.getRead().setBaseQualityString(QUALS); + } +}