Moved around private walkers into appropriate directories in private gatk.walkers. Moved a few public walkers into private qc package, and some private qc walkers into the public directory. Removed several obviously broken and/or unused walkers.
This commit is contained in:
parent
f4ae6edb92
commit
defa3cfe85
|
|
@ -0,0 +1,143 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
@By(DataSource.READS)
|
||||
// walker to count realigned reads
|
||||
public class RealignedReadCounter extends ReadWalker<Integer, Integer> {
|
||||
|
||||
public static final String ORIGINAL_CIGAR_TAG = "OC";
|
||||
public static final String ORIGINAL_POSITION_TAG = "OP";
|
||||
|
||||
@Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
|
||||
protected String intervalsFile = null;
|
||||
|
||||
// the intervals input by the user
|
||||
private Iterator<GenomeLoc> intervals = null;
|
||||
|
||||
// the current interval in the list
|
||||
private GenomeLoc currentInterval = null;
|
||||
|
||||
private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0;
|
||||
private boolean intervalWasUpdated = false;
|
||||
|
||||
public void initialize() {
|
||||
// prepare to read intervals one-by-one, as needed (assuming they are sorted).
|
||||
intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
|
||||
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
||||
}
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
if ( currentInterval == null ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read);
|
||||
// hack to get around unmapped reads having screwy locations
|
||||
if ( readLoc.getStop() == 0 )
|
||||
readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart());
|
||||
|
||||
if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) )
|
||||
return 0;
|
||||
|
||||
if ( readLoc.overlapsP(currentInterval) ) {
|
||||
if ( doNotTryToClean(read) )
|
||||
return 0;
|
||||
|
||||
if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) {
|
||||
String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG);
|
||||
// deal with an old bug
|
||||
if ( read.getCigar().toString().equals(newCigar) ) {
|
||||
//System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ( !intervalWasUpdated ) {
|
||||
intervalWasUpdated = true;
|
||||
updatedIntervals++;
|
||||
affectedBases += 20 + getIndelSize(read);
|
||||
}
|
||||
updatedReads++;
|
||||
|
||||
}
|
||||
} else {
|
||||
do {
|
||||
intervalWasUpdated = false;
|
||||
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
||||
} while ( currentInterval != null && currentInterval.isBefore(readLoc) );
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private int getIndelSize(SAMRecord read) {
|
||||
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
|
||||
if ( ce.getOperator() == CigarOperator.I )
|
||||
return 0;
|
||||
if ( ce.getOperator() == CigarOperator.D )
|
||||
return ce.getLength();
|
||||
}
|
||||
logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar());
|
||||
return 0;
|
||||
}
|
||||
|
||||
private boolean doNotTryToClean(SAMRecord read) {
|
||||
return read.getReadUnmappedFlag() ||
|
||||
read.getNotPrimaryAlignmentFlag() ||
|
||||
read.getReadFailsVendorQualityCheckFlag() ||
|
||||
read.getMappingQuality() == 0 ||
|
||||
read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ||
|
||||
(BadMateFilter.hasBadMate(read));
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
System.out.println(updatedIntervals + " intervals were updated");
|
||||
System.out.println(updatedReads + " reads were updated");
|
||||
System.out.println(affectedBases + " bases were affected");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.List;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but
|
||||
* very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to.
|
||||
* This was its very first use.
|
||||
*/
|
||||
public class CountIntervals extends RefWalker<Long, Long> {
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
@Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false)
|
||||
int numOverlaps = 2;
|
||||
|
||||
public Long reduceInit() {
|
||||
return 0l;
|
||||
}
|
||||
|
||||
public boolean isReduceByInterval() { return true; }
|
||||
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<GATKFeature> checkIntervals = tracker.getGATKFeatureMetaData("check",false);
|
||||
return (long) checkIntervals.size();
|
||||
}
|
||||
|
||||
public Long reduce(Long loc, Long prev) {
|
||||
if ( loc == null ) {
|
||||
return 0l;
|
||||
} else {
|
||||
return Math.max(prev,loc);
|
||||
}
|
||||
}
|
||||
|
||||
public void onTraversalDone(List<Pair<GenomeLoc,Long>> finalReduce) {
|
||||
long count = 0;
|
||||
for ( Pair<GenomeLoc,Long> g : finalReduce ) {
|
||||
if ( g.second >= numOverlaps) {
|
||||
count ++;
|
||||
}
|
||||
}
|
||||
out.printf("Number of contiguous intervals: %d",count);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,157 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Emits specific fields as dictated by the user from one or more VCF files.
|
||||
*/
|
||||
@Requires(value={})
|
||||
public class ProfileRodSystem extends RodWalker<Integer, Integer> {
|
||||
@Output(doc="File to which results should be written",required=true)
|
||||
protected PrintStream out;
|
||||
|
||||
@Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false)
|
||||
int nIterations = 1;
|
||||
|
||||
@Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false)
|
||||
int MAX_RECORDS = -1;
|
||||
|
||||
SimpleTimer timer = new SimpleTimer("myTimer");
|
||||
|
||||
public void initialize() {
|
||||
File rodFile = getRodFile();
|
||||
|
||||
out.printf("# walltime is in seconds%n");
|
||||
out.printf("# file is %s%n", rodFile);
|
||||
out.printf("# file size is %d bytes%n", rodFile.length());
|
||||
out.printf("operation\titeration\twalltime%n");
|
||||
for ( int i = 0; i < nIterations; i++ ) {
|
||||
out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE));
|
||||
out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE));
|
||||
out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS));
|
||||
out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC));
|
||||
out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE));
|
||||
}
|
||||
|
||||
timer.start(); // start up timer for map itself
|
||||
}
|
||||
|
||||
private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE };
|
||||
|
||||
private final double readFile(File f, ReadMode mode) {
|
||||
timer.start();
|
||||
|
||||
try {
|
||||
byte[] data = new byte[100000];
|
||||
FileInputStream s = new FileInputStream(f);
|
||||
|
||||
if ( mode == ReadMode.BY_BYTE ) {
|
||||
while (true) {
|
||||
if ( s.read(data) == -1 )
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
int counter = 0;
|
||||
VCFCodec codec = new VCFCodec();
|
||||
String[] parts = new String[100000];
|
||||
AsciiLineReader lineReader = new AsciiLineReader(s);
|
||||
|
||||
if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE )
|
||||
codec.readHeader(lineReader);
|
||||
|
||||
while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) {
|
||||
String line = lineReader.readLine();
|
||||
if ( line == null )
|
||||
break;
|
||||
else if ( mode == ReadMode.BY_PARTS ) {
|
||||
ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
}
|
||||
else if ( mode == ReadMode.DECODE_LOC ) {
|
||||
codec.decodeLoc(line);
|
||||
}
|
||||
else if ( mode == ReadMode.DECODE ) {
|
||||
processOneVC((VariantContext)codec.decode(line));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch ( Exception e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return timer.getElapsedTime();
|
||||
}
|
||||
|
||||
private File getRodFile() {
|
||||
List<ReferenceOrderedDataSource> rods = this.getToolkit().getRodDataSources();
|
||||
ReferenceOrderedDataSource rod = rods.get(0);
|
||||
return rod.getFile();
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return 0;
|
||||
|
||||
VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation());
|
||||
processOneVC(vc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private static final void processOneVC(VariantContext vc) {
|
||||
vc.getNSamples(); // force us to parse the samples
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer counter, Integer sum) {
|
||||
return counter + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.math.BigInteger;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* a walker for validating (in the style of validating pile-up) the ROD system.
|
||||
*/
|
||||
@Reference(window=@Window(start=-40,stop=40))
|
||||
public class RodSystemValidationWalker extends RodWalker<Integer,Integer> {
|
||||
|
||||
// the divider to use in some of the text output
|
||||
private static final String DIVIDER = ",";
|
||||
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
@Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts")
|
||||
public boolean allRecordsVariantContextEquivalent = false;
|
||||
|
||||
// used to calculate the MD5 of a file
|
||||
MessageDigest digest = null;
|
||||
|
||||
// we sometimes need to know what rods the engine's seen
|
||||
List<ReferenceOrderedDataSource> rodList;
|
||||
|
||||
/**
|
||||
* emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change
|
||||
* underneath us).
|
||||
*/
|
||||
public void initialize() {
|
||||
// setup the MD5-er
|
||||
try {
|
||||
digest = MessageDigest.getInstance("MD5");
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new ReviewedStingException("Unable to find MD5 checksumer");
|
||||
}
|
||||
out.println("Header:");
|
||||
// enumerate the list of ROD's we've loaded
|
||||
rodList = this.getToolkit().getRodDataSources();
|
||||
for (ReferenceOrderedDataSource rod : rodList) {
|
||||
out.println(rod.getName() + DIVIDER + rod.getType());
|
||||
out.println(rod.getName() + DIVIDER + rod.getFile());
|
||||
out.println(rod.getName() + DIVIDER + md5sum(rod.getFile()));
|
||||
}
|
||||
out.println("Data:");
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param tracker the ref meta data tracker to get RODs
|
||||
* @param ref reference context
|
||||
* @param context the reads
|
||||
* @return an 1 for each site with a rod(s), 0 otherwise
|
||||
*/
|
||||
@Override
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
int ret = 0;
|
||||
if (tracker != null && tracker.getAllRods().size() > 0) {
|
||||
out.print(context.getLocation() + DIVIDER);
|
||||
Collection<GATKFeature> features = tracker.getAllRods();
|
||||
for (GATKFeature feat : features)
|
||||
out.print(feat.getName() + DIVIDER);
|
||||
out.println(";");
|
||||
ret++;
|
||||
}
|
||||
|
||||
// if the argument was set, check for equivalence
|
||||
if (allRecordsVariantContextEquivalent && tracker != null) {
|
||||
Collection<VariantContext> col = tracker.getAllVariantContexts(ref);
|
||||
VariantContext con = null;
|
||||
for (VariantContext contextInList : col)
|
||||
if (con == null) con = contextInList;
|
||||
else if (!con.equals(col)) out.println("FAIL: context " + col + " doesn't match " + con);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
// Double check traversal result to make count is the same.
|
||||
// TODO: Is this check necessary?
|
||||
out.println("[REDUCE RESULT] Traversal result is: " + result);
|
||||
}
|
||||
|
||||
// shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html
|
||||
private String md5sum(File f) {
|
||||
InputStream is;
|
||||
try {
|
||||
is = new FileInputStream(f);
|
||||
} catch (FileNotFoundException e) {
|
||||
return "Not a file";
|
||||
}
|
||||
byte[] buffer = new byte[8192];
|
||||
int read = 0;
|
||||
try {
|
||||
while ((read = is.read(buffer)) > 0) {
|
||||
digest.update(buffer, 0, read);
|
||||
}
|
||||
byte[] md5sum = digest.digest();
|
||||
BigInteger bigInt = new BigInteger(1, md5sum);
|
||||
return bigInt.toString(16);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException("Unable to process file for MD5", e);
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
is.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException("Unable to close input stream for MD5 calculation", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,298 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
|
||||
* Can also count the number of reads matching a given criterion using read filters (see the
|
||||
* --read-filter command line argument). Simplest example of a read-backed analysis.
|
||||
*/
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER)
|
||||
@Reference(window=@Window(start=-5,stop=5))
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||
public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
||||
@Output(doc="File to which results should be written",required=true)
|
||||
protected PrintStream out;
|
||||
|
||||
@Argument(doc="maximum read length to apply the BAQ calculation too",required=false)
|
||||
protected int maxReadLen = 1000;
|
||||
|
||||
@Argument(doc="",required=false)
|
||||
protected int bw = 7;
|
||||
|
||||
@Argument(doc="",required=false)
|
||||
protected boolean samtoolsMode = false;
|
||||
|
||||
@Argument(doc="only operates on reads with this name",required=false)
|
||||
protected String readName = null;
|
||||
|
||||
@Argument(doc="If true, all differences are errors", required=false)
|
||||
protected boolean strict = false;
|
||||
|
||||
@Argument(doc="prints info for each read", required=false)
|
||||
protected boolean printEachRead = false;
|
||||
|
||||
@Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false)
|
||||
protected boolean alsoPrintWarnings = false;
|
||||
|
||||
@Argument(doc="Include reads without BAQ tag", required=false)
|
||||
protected boolean includeReadsWithoutBAQTag = false;
|
||||
|
||||
@Argument(doc="x each read is processed", required=false)
|
||||
protected int magnification = 1;
|
||||
|
||||
@Argument(doc="Profile performance", required=false)
|
||||
protected boolean profile = false;
|
||||
|
||||
int counter = 0;
|
||||
|
||||
BAQ baqHMM = null; // matches current samtools parameters
|
||||
|
||||
public void initialize() {
|
||||
if ( samtoolsMode )
|
||||
baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true);
|
||||
else
|
||||
baqHMM = new BAQ();
|
||||
}
|
||||
|
||||
long goodReads = 0, badReads = 0;
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||
|
||||
if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) {
|
||||
if ( baqHMM.excludeReadFromBAQ(read) )
|
||||
return 0;
|
||||
|
||||
if ( profile ) {
|
||||
profileBAQ(ref, read);
|
||||
} else {
|
||||
validateBAQ(ref, read);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
SimpleTimer tagTimer = new SimpleTimer("from.tag");
|
||||
SimpleTimer baqReadTimer = new SimpleTimer("baq.read");
|
||||
SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal");
|
||||
|
||||
private void profileBAQ(ReferenceContext ref, SAMRecord read) {
|
||||
IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();
|
||||
BAQ.BAQCalculationResult baq = null;
|
||||
|
||||
tagTimer.restart();
|
||||
for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); }
|
||||
tagTimer.stop();
|
||||
|
||||
baqReadTimer.restart();
|
||||
for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); }
|
||||
baqReadTimer.stop();
|
||||
|
||||
glocalTimer.restart();
|
||||
for ( int i = 0; i < magnification; i++ )
|
||||
baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY);
|
||||
glocalTimer.stop();
|
||||
}
|
||||
|
||||
|
||||
private void validateBAQ(ReferenceContext ref, SAMRecord read) {
|
||||
IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();
|
||||
byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag);
|
||||
if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);
|
||||
|
||||
BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader);
|
||||
|
||||
boolean fail = false;
|
||||
boolean print = false;
|
||||
int badi = 0;
|
||||
|
||||
if ( BAQ.hasBAQTag(read) ) {
|
||||
for ( badi = 0; badi < baqFromTag.length; badi++ ) {
|
||||
if ( baqFromTag[badi] != baq.bq[badi] ) {
|
||||
if ( cigarLength(read) != read.getReadLength() ) {
|
||||
print = true;
|
||||
fail = false;
|
||||
out.printf(" different, but cigar length != read length%n");
|
||||
break;
|
||||
}
|
||||
if (MathUtils.arrayMin(read.getBaseQualities()) == 0) {
|
||||
print = true;
|
||||
fail = strict;
|
||||
out.printf(" different, but Q0 base detected%n");
|
||||
break;
|
||||
}
|
||||
else if (readHasSoftClip(read) && ! samtoolsMode) {
|
||||
print = true;
|
||||
fail = strict;
|
||||
out.printf(" different, but soft clip detected%n");
|
||||
break;
|
||||
} else if (readHasDeletion(read) ) { // && ! samtoolsMode) {
|
||||
print = true;
|
||||
fail = strict;
|
||||
out.printf(" different, but deletion detected%n");
|
||||
break;
|
||||
} else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) {
|
||||
print = fail = true;
|
||||
out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual());
|
||||
break;
|
||||
} else {
|
||||
print = fail = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( fail || print )
|
||||
badReads++;
|
||||
else
|
||||
goodReads++;
|
||||
}
|
||||
|
||||
if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) {
|
||||
byte[] pos = new byte[baq.bq.length];
|
||||
for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i;
|
||||
|
||||
out.printf(" read length : %d%n", read.getReadLength());
|
||||
out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart());
|
||||
out.printf(" cigar : %s%n", read.getCigarString());
|
||||
out.printf(" ref bases : %s%n", new String(baq.refBases));
|
||||
out.printf(" read bases : %s%n", new String(read.getReadBases()));
|
||||
out.printf(" ref length : %d%n", baq.refBases.length);
|
||||
out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
|
||||
if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true);
|
||||
printQuals(" original quals: ", read.getBaseQualities(), true);
|
||||
printQuals(" baq quals: ", baq.bq, true);
|
||||
printQuals(" positions : ", pos, true);
|
||||
printQuals(" original quals: ", read.getBaseQualities());
|
||||
if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag);
|
||||
printQuals(" hmm quals: ", baq.bq);
|
||||
out.printf(" read bases : %s%n", new String(read.getReadBases()));
|
||||
out.println(Utils.dupString('-', 80));
|
||||
}
|
||||
|
||||
|
||||
if ( fail )
|
||||
throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d",
|
||||
read.getReadName(), badi, baqFromTag[badi], baq.bq[badi]));
|
||||
}
|
||||
|
||||
private final static boolean readHasSoftClip(SAMRecord read) {
|
||||
for (CigarElement e : read.getCigar().getCigarElements()) {
|
||||
if ( e.getOperator() == CigarOperator.SOFT_CLIP )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private final static boolean readHasDeletion(SAMRecord read) {
|
||||
for (CigarElement e : read.getCigar().getCigarElements()) {
|
||||
if ( e.getOperator() == CigarOperator.DELETION )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public final void printQuals( String prefix, byte[] quals ) {
|
||||
printQuals(prefix, quals, false);
|
||||
}
|
||||
|
||||
public final void printQuals( String prefix, byte[] quals, boolean asInt ) {
|
||||
printQuals(out, prefix, quals, asInt);
|
||||
}
|
||||
|
||||
public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
|
||||
out.print(prefix);
|
||||
for ( int i = 0; i < quals.length; i++) {
|
||||
if ( asInt ) {
|
||||
out.printf("%2d", (int)quals[i]);
|
||||
if ( i+1 != quals.length ) out.print(",");
|
||||
} else
|
||||
out.print((char)(quals[i]+33));
|
||||
}
|
||||
out.println();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
|
||||
* @param read
|
||||
* @return
|
||||
*/
|
||||
public static byte[] getBAQDeltas(SAMRecord read) {
|
||||
byte[] baq = BAQ.getBAQTag(read);
|
||||
if ( baq != null ) {
|
||||
byte[] deltas = new byte[baq.length];
|
||||
for ( int i = 0; i < deltas.length; i++)
|
||||
deltas[i] = (byte)(-1 * (baq[i] - 64));
|
||||
return deltas;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
|
||||
private int cigarLength(SAMRecord read) {
|
||||
int readI = 0;
|
||||
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
|
||||
int l = elt.getLength();
|
||||
switch (elt.getOperator()) {
|
||||
case N: // cannot handle these
|
||||
return 0;
|
||||
case H : case P : // ignore pads and hard clips
|
||||
break;
|
||||
case S :
|
||||
case I :
|
||||
readI += l;
|
||||
break;
|
||||
case D : break;
|
||||
case M :
|
||||
readI += l;
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
|
||||
}
|
||||
}
|
||||
return readI;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer nreads) {
|
||||
if ( profile ) {
|
||||
out.printf("n.reads baq.per.read calculation time.in.secs%n");
|
||||
printTimer(nreads, tagTimer);
|
||||
printTimer(nreads, glocalTimer);
|
||||
printTimer(nreads, baqReadTimer);
|
||||
} else {
|
||||
out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads,
|
||||
goodReads, (100.0 * goodReads) / nreads,
|
||||
badReads, (100.0 * badReads) / nreads);
|
||||
}
|
||||
}
|
||||
|
||||
private final void printTimer(int nreads, SimpleTimer timer) {
|
||||
out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider;
|
|||
import org.testng.annotations.BeforeMethod;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Arrays;
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.*;
|
||||
|
||||
/**
|
||||
|
|
@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest {
|
|||
|
||||
System.out.println(Utils.dupString('-', 40));
|
||||
System.out.println("reads : " + new String(test.readBases));
|
||||
ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false);
|
||||
ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false);
|
||||
printQuals(System.out, "in-quals:", test.quals, false);
|
||||
printQuals(System.out, "bq-quals:", result.bq, false);
|
||||
for (int i = 0; i < test.quals.length; i++) {
|
||||
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
|
||||
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
|
||||
|
|
@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
|
||||
out.print(prefix);
|
||||
for ( int i = 0; i < quals.length; i++) {
|
||||
if ( asInt ) {
|
||||
out.printf("%2d", (int)quals[i]);
|
||||
if ( i+1 != quals.length ) out.print(",");
|
||||
} else
|
||||
out.print((char)(quals[i]+33));
|
||||
}
|
||||
out.println();
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue