From defa3cfe852c1a1e6a0a01a4cffb59353e2dbeed Mon Sep 17 00:00:00 2001 From: "Mark A. DePristo" Date: Thu, 30 Jun 2011 14:59:58 -0400 Subject: [PATCH] Moved around private walkers into appropriate directories in private gatk.walkers. Moved a few public walkers into private qc package, and some private qc walkers into the public directory. Removed several obviously broken and/or unused walkers. --- .../walkers/indels/RealignedReadCounter.java | 143 +++++++++ .../sting/gatk/walkers/qc/CountIntervals.java | 62 ++++ .../gatk/walkers/qc/ProfileRodSystem.java | 157 --------- .../walkers/qc/RodSystemValidationWalker.java | 153 +++++++++ .../gatk/walkers/qc/ValidateBAQWalker.java | 298 ------------------ .../sting/utils/baq/BAQUnitTest.java | 21 +- 6 files changed, 373 insertions(+), 461 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java new file mode 100755 index 000000000..fc196e712 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.commandline.Argument; + +import java.io.File; +import java.util.*; + +@By(DataSource.READS) +// walker to count realigned reads +public class RealignedReadCounter extends ReadWalker { + + public static final String ORIGINAL_CIGAR_TAG = "OC"; + public static final String ORIGINAL_POSITION_TAG = "OP"; + + @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) + protected String intervalsFile = null; + + // the intervals input by the user + private Iterator intervals = null; + + // the current interval in the list + private GenomeLoc currentInterval = null; + + private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0; + private boolean intervalWasUpdated = false; + + public void initialize() { + // prepare to read intervals one-by-one, as needed (assuming they are sorted). + intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); + currentInterval = intervals.hasNext() ? intervals.next() : null; + } + + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + if ( currentInterval == null ) { + return 0; + } + + GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read); + // hack to get around unmapped reads having screwy locations + if ( readLoc.getStop() == 0 ) + readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); + + if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) ) + return 0; + + if ( readLoc.overlapsP(currentInterval) ) { + if ( doNotTryToClean(read) ) + return 0; + + if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) { + String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG); + // deal with an old bug + if ( read.getCigar().toString().equals(newCigar) ) { + //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar); + return 0; + } + + if ( !intervalWasUpdated ) { + intervalWasUpdated = true; + updatedIntervals++; + affectedBases += 20 + getIndelSize(read); + } + updatedReads++; + + } + } else { + do { + intervalWasUpdated = false; + currentInterval = intervals.hasNext() ? intervals.next() : null; + } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); + } + + return 0; + } + + private int getIndelSize(SAMRecord read) { + for ( CigarElement ce : read.getCigar().getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.I ) + return 0; + if ( ce.getOperator() == CigarOperator.D ) + return ce.getLength(); + } + logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar()); + return 0; + } + + private boolean doNotTryToClean(SAMRecord read) { + return read.getReadUnmappedFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getReadFailsVendorQualityCheckFlag() || + read.getMappingQuality() == 0 || + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || + (BadMateFilter.hasBadMate(read)); + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + System.out.println(updatedIntervals + " intervals were updated"); + System.out.println(updatedReads + " reads were updated"); + System.out.println(affectedBases + " bases were affected"); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java new file mode 100755 index 000000000..feb5f62af --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.List; +import java.io.PrintStream; + +/** + * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but + * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to. + * This was its very first use. + */ +public class CountIntervals extends RefWalker { + @Output + PrintStream out; + + @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) + int numOverlaps = 2; + + public Long reduceInit() { + return 0l; + } + + public boolean isReduceByInterval() { return true; } + + public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) { + return null; + } + + List checkIntervals = tracker.getGATKFeatureMetaData("check",false); + return (long) checkIntervals.size(); + } + + public Long reduce(Long loc, Long prev) { + if ( loc == null ) { + return 0l; + } else { + return Math.max(prev,loc); + } + } + + public void onTraversalDone(List> finalReduce) { + long count = 0; + for ( Pair g : finalReduce ) { + if ( g.second >= numOverlaps) { + count ++; + } + } + out.printf("Number of contiguous intervals: %d",count); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java deleted file mode 100755 index 67879a442..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SimpleTimer; - -import java.io.PrintStream; -import java.io.File; -import java.io.FileInputStream; -import java.util.*; - -/** - * Emits specific fields as dictated by the user from one or more VCF files. - */ -@Requires(value={}) -public class ProfileRodSystem extends RodWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false) - int nIterations = 1; - - @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false) - int MAX_RECORDS = -1; - - SimpleTimer timer = new SimpleTimer("myTimer"); - - public void initialize() { - File rodFile = getRodFile(); - - out.printf("# walltime is in seconds%n"); - out.printf("# file is %s%n", rodFile); - out.printf("# file size is %d bytes%n", rodFile.length()); - out.printf("operation\titeration\twalltime%n"); - for ( int i = 0; i < nIterations; i++ ) { - out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE)); - out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE)); - out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS)); - out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC)); - out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE)); - } - - timer.start(); // start up timer for map itself - } - - private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE }; - - private final double readFile(File f, ReadMode mode) { - timer.start(); - - try { - byte[] data = new byte[100000]; - FileInputStream s = new FileInputStream(f); - - if ( mode == ReadMode.BY_BYTE ) { - while (true) { - if ( s.read(data) == -1 ) - break; - } - } else { - int counter = 0; - VCFCodec codec = new VCFCodec(); - String[] parts = new String[100000]; - AsciiLineReader lineReader = new AsciiLineReader(s); - - if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE ) - codec.readHeader(lineReader); - - while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) { - String line = lineReader.readLine(); - if ( line == null ) - break; - else if ( mode == ReadMode.BY_PARTS ) { - ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR); - } - else if ( mode == ReadMode.DECODE_LOC ) { - codec.decodeLoc(line); - } - else if ( mode == ReadMode.DECODE ) { - processOneVC((VariantContext)codec.decode(line)); - } - } - } - } catch ( Exception e ) { - throw new RuntimeException(e); - } - - return timer.getElapsedTime(); - } - - private File getRodFile() { - List rods = this.getToolkit().getRodDataSources(); - ReferenceOrderedDataSource rod = rods.get(0); - return rod.getFile(); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation()); - processOneVC(vc); - - return 0; - } - - private static final void processOneVC(VariantContext vc) { - vc.getNSamples(); // force us to parse the samples - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) { - out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime()); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java new file mode 100644 index 000000000..9cb715507 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java @@ -0,0 +1,153 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.math.BigInteger; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Collection; +import java.util.List; + +/** + * a walker for validating (in the style of validating pile-up) the ROD system. + */ +@Reference(window=@Window(start=-40,stop=40)) +public class RodSystemValidationWalker extends RodWalker { + + // the divider to use in some of the text output + private static final String DIVIDER = ","; + + @Output + public PrintStream out; + + @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts") + public boolean allRecordsVariantContextEquivalent = false; + + // used to calculate the MD5 of a file + MessageDigest digest = null; + + // we sometimes need to know what rods the engine's seen + List rodList; + + /** + * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change + * underneath us). + */ + public void initialize() { + // setup the MD5-er + try { + digest = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new ReviewedStingException("Unable to find MD5 checksumer"); + } + out.println("Header:"); + // enumerate the list of ROD's we've loaded + rodList = this.getToolkit().getRodDataSources(); + for (ReferenceOrderedDataSource rod : rodList) { + out.println(rod.getName() + DIVIDER + rod.getType()); + out.println(rod.getName() + DIVIDER + rod.getFile()); + out.println(rod.getName() + DIVIDER + md5sum(rod.getFile())); + } + out.println("Data:"); + } + + /** + * + * @param tracker the ref meta data tracker to get RODs + * @param ref reference context + * @param context the reads + * @return an 1 for each site with a rod(s), 0 otherwise + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + int ret = 0; + if (tracker != null && tracker.getAllRods().size() > 0) { + out.print(context.getLocation() + DIVIDER); + Collection features = tracker.getAllRods(); + for (GATKFeature feat : features) + out.print(feat.getName() + DIVIDER); + out.println(";"); + ret++; + } + + // if the argument was set, check for equivalence + if (allRecordsVariantContextEquivalent && tracker != null) { + Collection col = tracker.getAllVariantContexts(ref); + VariantContext con = null; + for (VariantContext contextInList : col) + if (con == null) con = contextInList; + else if (!con.equals(col)) out.println("FAIL: context " + col + " doesn't match " + con); + } + return ret; + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + @Override + public Integer reduceInit() { + return 0; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public void onTraversalDone(Integer result) { + // Double check traversal result to make count is the same. + // TODO: Is this check necessary? + out.println("[REDUCE RESULT] Traversal result is: " + result); + } + + // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html + private String md5sum(File f) { + InputStream is; + try { + is = new FileInputStream(f); + } catch (FileNotFoundException e) { + return "Not a file"; + } + byte[] buffer = new byte[8192]; + int read = 0; + try { + while ((read = is.read(buffer)) > 0) { + digest.update(buffer, 0, read); + } + byte[] md5sum = digest.digest(); + BigInteger bigInt = new BigInteger(1, md5sum); + return bigInt.toString(16); + } + catch (IOException e) { + throw new RuntimeException("Unable to process file for MD5", e); + } + finally { + try { + is.close(); + } + catch (IOException e) { + throw new RuntimeException("Unable to close input stream for MD5 calculation", e); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java deleted file mode 100755 index eda01bdb4..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java +++ /dev/null @@ -1,298 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.PrintStream; - -/** - * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. - * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. - */ -@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER) -@Reference(window=@Window(start=-5,stop=5)) -@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) -public class ValidateBAQWalker extends ReadWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(doc="maximum read length to apply the BAQ calculation too",required=false) - protected int maxReadLen = 1000; - - @Argument(doc="",required=false) - protected int bw = 7; - - @Argument(doc="",required=false) - protected boolean samtoolsMode = false; - - @Argument(doc="only operates on reads with this name",required=false) - protected String readName = null; - - @Argument(doc="If true, all differences are errors", required=false) - protected boolean strict = false; - - @Argument(doc="prints info for each read", required=false) - protected boolean printEachRead = false; - - @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false) - protected boolean alsoPrintWarnings = false; - - @Argument(doc="Include reads without BAQ tag", required=false) - protected boolean includeReadsWithoutBAQTag = false; - - @Argument(doc="x each read is processed", required=false) - protected int magnification = 1; - - @Argument(doc="Profile performance", required=false) - protected boolean profile = false; - - int counter = 0; - - BAQ baqHMM = null; // matches current samtools parameters - - public void initialize() { - if ( samtoolsMode ) - baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true); - else - baqHMM = new BAQ(); - } - - long goodReads = 0, badReads = 0; - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - - if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) { - if ( baqHMM.excludeReadFromBAQ(read) ) - return 0; - - if ( profile ) { - profileBAQ(ref, read); - } else { - validateBAQ(ref, read); - } - - return 1; - } - - return 0; - } - - SimpleTimer tagTimer = new SimpleTimer("from.tag"); - SimpleTimer baqReadTimer = new SimpleTimer("baq.read"); - SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal"); - - private void profileBAQ(ReferenceContext ref, SAMRecord read) { - IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); - BAQ.BAQCalculationResult baq = null; - - tagTimer.restart(); - for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); } - tagTimer.stop(); - - baqReadTimer.restart(); - for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); } - baqReadTimer.stop(); - - glocalTimer.restart(); - for ( int i = 0; i < magnification; i++ ) - baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY); - glocalTimer.stop(); - } - - - private void validateBAQ(ReferenceContext ref, SAMRecord read) { - IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); - byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); - if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter); - - BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader); - - boolean fail = false; - boolean print = false; - int badi = 0; - - if ( BAQ.hasBAQTag(read) ) { - for ( badi = 0; badi < baqFromTag.length; badi++ ) { - if ( baqFromTag[badi] != baq.bq[badi] ) { - if ( cigarLength(read) != read.getReadLength() ) { - print = true; - fail = false; - out.printf(" different, but cigar length != read length%n"); - break; - } - if (MathUtils.arrayMin(read.getBaseQualities()) == 0) { - print = true; - fail = strict; - out.printf(" different, but Q0 base detected%n"); - break; - } - else if (readHasSoftClip(read) && ! samtoolsMode) { - print = true; - fail = strict; - out.printf(" different, but soft clip detected%n"); - break; - } else if (readHasDeletion(read) ) { // && ! samtoolsMode) { - print = true; - fail = strict; - out.printf(" different, but deletion detected%n"); - break; - } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) { - print = fail = true; - out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual()); - break; - } else { - print = fail = true; - break; - } - } - } - if ( fail || print ) - badReads++; - else - goodReads++; - } - - if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) { - byte[] pos = new byte[baq.bq.length]; - for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i; - - out.printf(" read length : %d%n", read.getReadLength()); - out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart()); - out.printf(" cigar : %s%n", read.getCigarString()); - out.printf(" ref bases : %s%n", new String(baq.refBases)); - out.printf(" read bases : %s%n", new String(read.getReadBases())); - out.printf(" ref length : %d%n", baq.refBases.length); - out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG)); - if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true); - printQuals(" original quals: ", read.getBaseQualities(), true); - printQuals(" baq quals: ", baq.bq, true); - printQuals(" positions : ", pos, true); - printQuals(" original quals: ", read.getBaseQualities()); - if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag); - printQuals(" hmm quals: ", baq.bq); - out.printf(" read bases : %s%n", new String(read.getReadBases())); - out.println(Utils.dupString('-', 80)); - } - - - if ( fail ) - throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d", - read.getReadName(), badi, baqFromTag[badi], baq.bq[badi])); - } - - private final static boolean readHasSoftClip(SAMRecord read) { - for (CigarElement e : read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.SOFT_CLIP ) - return true; - } - - return false; - } - - private final static boolean readHasDeletion(SAMRecord read) { - for (CigarElement e : read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.DELETION ) - return true; - } - - return false; - } - - public final void printQuals( String prefix, byte[] quals ) { - printQuals(prefix, quals, false); - } - - public final void printQuals( String prefix, byte[] quals, boolean asInt ) { - printQuals(out, prefix, quals, asInt); - } - - public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { - out.print(prefix); - for ( int i = 0; i < quals.length; i++) { - if ( asInt ) { - out.printf("%2d", (int)quals[i]); - if ( i+1 != quals.length ) out.print(","); - } else - out.print((char)(quals[i]+33)); - } - out.println(); - } - - /** - * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present. - * @param read - * @return - */ - public static byte[] getBAQDeltas(SAMRecord read) { - byte[] baq = BAQ.getBAQTag(read); - if ( baq != null ) { - byte[] deltas = new byte[baq.length]; - for ( int i = 0; i < deltas.length; i++) - deltas[i] = (byte)(-1 * (baq[i] - 64)); - return deltas; - } else - return null; - } - - private int cigarLength(SAMRecord read) { - int readI = 0; - for ( CigarElement elt : read.getCigar().getCigarElements() ) { - int l = elt.getLength(); - switch (elt.getOperator()) { - case N: // cannot handle these - return 0; - case H : case P : // ignore pads and hard clips - break; - case S : - case I : - readI += l; - break; - case D : break; - case M : - readI += l; - break; - default: - throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); - } - } - return readI; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer nreads) { - if ( profile ) { - out.printf("n.reads baq.per.read calculation time.in.secs%n"); - printTimer(nreads, tagTimer); - printTimer(nreads, glocalTimer); - printTimer(nreads, baqReadTimer); - } else { - out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads, - goodReads, (100.0 * goodReads) / nreads, - badReads, (100.0 * badReads) / nreads); - } - } - - private final void printTimer(int nreads, SimpleTimer timer) { - out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime()); - } -} - diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java index 08be4f742..f4c3163cf 100644 --- a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java @@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.BeforeMethod; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.Utils; import java.io.File; import java.io.FileNotFoundException; -import java.util.Arrays; +import java.io.PrintStream; import java.util.List; import java.util.ArrayList; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.*; /** @@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest { System.out.println(Utils.dupString('-', 40)); System.out.println("reads : " + new String(test.readBases)); - ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false); - ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false); + printQuals(System.out, "in-quals:", test.quals, false); + printQuals(System.out, "bq-quals:", result.bq, false); for (int i = 0; i < test.quals.length; i++) { //result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i); Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality"); @@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest { } } + + public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { + out.print(prefix); + for ( int i = 0; i < quals.length; i++) { + if ( asInt ) { + out.printf("%2d", (int)quals[i]); + if ( i+1 != quals.length ) out.print(","); + } else + out.print((char)(quals[i]+33)); + } + out.println(); + } } \ No newline at end of file