Added NeighborhoodQualityWalker.java and ReadQualityScoreWalker.java which are used to calculate a read quality score based on attributes of the read and the reads in the neighborhood.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1922 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
68fa6da788
commit
06ff81efe5
|
|
@ -0,0 +1,162 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Iterator;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Oct 23, 2009
|
||||
*
|
||||
* This walker is designed to work as the first pass in a two-pass processing step.
|
||||
* It does a by-locus traversal calculating a neighborhood quality score based on a number of factors:
|
||||
* 1.) Number of reads in neighborhood whose mate is mapped to a different chromosome.
|
||||
* 2.) Average mapping quality of reads in the neighborhood.
|
||||
* 3.) Average reference mismatch rate for all reads in the neighborhood.
|
||||
* The output file is a list of: (GenomeLoc QualityScore) for every locus
|
||||
*
|
||||
* This walker is designed to be used in conjunction with ReadQualityScoreWalker.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Example of what the output should look like:
|
||||
*
|
||||
* 1:10999919 25.781164
|
||||
* 1:10999920 30.321754
|
||||
* 1:10999921 30.321754
|
||||
* 1:10999922 30.321754
|
||||
* 1:10999923 30.005175
|
||||
* 1:10999924 29.82714
|
||||
* 1:10999925 29.901012
|
||||
* 1:10999926 24.971085
|
||||
* 1:10999927 24.634737
|
||||
* 1:10999928 21.552652
|
||||
* 1:10999929 21.95971
|
||||
* 1:10999930 21.95971
|
||||
* 1:10999931 20.272423
|
||||
* 1:10999932 18.20454
|
||||
* 1:10999933 18.20454
|
||||
* 1:10999934 18.20454
|
||||
*/
|
||||
|
||||
public class NeighborhoodQualityWalker extends LocusWalker<Integer, Long> {
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
float neighborhoodQualityScore = 0.0f;
|
||||
int numReadsMismatchedMate = 0; // num of reads in this locus whose mate is mapped to different chromosome
|
||||
float percentMismatchedMate = 0.0f; // percentage of reads at this locus whose mate is mapped to diff chromosome
|
||||
float avgMappingQuality = 0.0f; // mean mapping quality for all reads at this locus
|
||||
float avgMismatchRate = 0.0f; // mean mismatch with reference rate over all reads at this locus
|
||||
|
||||
int numValidMappingQuality = 0;
|
||||
long sumMappingQuality = 0L;
|
||||
float sumMismatchRate = 0.0f;
|
||||
int mappingQuality = 0; // preallocate for use in while loop below
|
||||
boolean isGoodPair = false; // preallocate for use in while loop below
|
||||
SAMRecord read = null; // preallocate for use in while loop below
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
Iterator<SAMRecord> readsIter = reads.iterator();
|
||||
assert reads.size() > 0 : "This locus has no reads.";
|
||||
|
||||
while( readsIter.hasNext() ) { // for each read in this context
|
||||
read = readsIter.next();
|
||||
|
||||
// Only consider reads for this calculation whose mapping quality isn't 0 or 255
|
||||
mappingQuality = read.getMappingQuality();
|
||||
if ( mappingQuality > 0 && mappingQuality < 255 ) {
|
||||
|
||||
// Generate sum of mapping quality for all reads at this locus
|
||||
sumMappingQuality += mappingQuality;
|
||||
numValidMappingQuality++;
|
||||
|
||||
// Look to see if mate was mapped to different chromosome
|
||||
//isGoodPair = ( read.getReadPairedFlag() ? read.getProperPairFlag() : true );
|
||||
isGoodPair = ( !read.getReadPairedFlag() || read.getProperPairFlag() ); // optimized version of above line
|
||||
if ( !isGoodPair ) { numReadsMismatchedMate++; }
|
||||
|
||||
// Generate sum number of mismatches for all reads at this locus
|
||||
if( read.getAttribute("NM") != null ) {
|
||||
sumMismatchRate += ((float) Integer.parseInt(read.getAttribute("NM").toString())) / ((float) read.getReadLength());
|
||||
} else {
|
||||
sumMismatchRate += 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate averages from sums which accumulated during while loop above
|
||||
if ( numValidMappingQuality == 0 ) { numValidMappingQuality = 1; }
|
||||
percentMismatchedMate = ((float) numReadsMismatchedMate) / ((float) numValidMappingQuality);
|
||||
avgMappingQuality = sumMappingQuality / ((float) numValidMappingQuality);
|
||||
avgMismatchRate = sumMismatchRate / ((float) numValidMappingQuality);
|
||||
|
||||
// Calculate the three metrics that go into a neighborhood quality score using exponential decay model
|
||||
// BUGBUG: some analysis is needed to determine reasonable rates and scale factors for the exponential functions
|
||||
float scoreMates = 40.0f * (float) Math.exp( -16.0f * (float) percentMismatchedMate );
|
||||
// exp decay with rate 16.0, scaled to Q=40 when mismatched mates is 0%
|
||||
float scoreMapping = 40.0f * (float) Math.exp( -0.02f * Math.max( 99.0f - avgMappingQuality, 0.0f ) );
|
||||
// exp decay with rate 0.02, scaled to Q=40 when avg map quality is 99
|
||||
float scoreMismatch = 40.0f * (float) Math.exp( -27.0f * avgMismatchRate );
|
||||
// exp decay with rate 27.0, scaled to Q=40 when avg mismatch rate in reads is 0
|
||||
|
||||
// BUGBUG: some analysis is needed to determine reasonable weights for each metric
|
||||
neighborhoodQualityScore = 0.35f * scoreMates + 0.05f * scoreMapping + 0.6f * scoreMismatch;
|
||||
assert neighborhoodQualityScore >= 0.0f : "Neighborhood quality score must be nonnegative.";
|
||||
if( neighborhoodQualityScore < 1.0f ) { neighborhoodQualityScore = 1.0f; }
|
||||
|
||||
// verbose debug printing lines
|
||||
logger.debug( context.getLocation() + " " + neighborhoodQualityScore );
|
||||
logger.debug( "mate mismatch% =\t" + percentMismatchedMate + " --> " + scoreMates );
|
||||
logger.debug( "mean mappingQ =\t" + avgMappingQuality + " --> " + scoreMapping );
|
||||
logger.debug( "mean mismatchRate =\t" + avgMismatchRate + " --> " + scoreMismatch );
|
||||
|
||||
// This printout useful for making histograms of scores in Matlab
|
||||
//out.println( neighborhoodQualityScore + " " + scoreMates + " " + scoreMapping + " " + scoreMismatch);
|
||||
|
||||
out.println( context.getLocation() + " " + neighborhoodQualityScore );
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Long reduceInit() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
public Long reduce( Integer value, Long sum ) {
|
||||
return 0L; // nothing to do here
|
||||
}
|
||||
|
||||
public void onTraversalDone( Long reduceResult ) {
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,200 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Oct 23, 2009
|
||||
*
|
||||
* This walker is designed to work as the second pass in a two-pass processing step.
|
||||
* It does a by-read traversal calculating a read quality score based on a number of factors:
|
||||
* 1.) Average neighborhood quality score for the length of the read (data generated by NeighborhoodQualityWalker)
|
||||
* 2.) Is this read's mate mapped to a different chromosome?
|
||||
* 3.) The mapping quality for this read.
|
||||
* 4.) Number of reference mismatches in this read.
|
||||
* This walker creates a new bam file in which each read is annotated by this read quality score
|
||||
* in addition if the read quality score is below the given threshold, the read is flagged.
|
||||
*
|
||||
* This walker requires as input the file of (GenomeLoc QualityScore)'s generated by NeighborhoodQualityWalker.
|
||||
* This walker accepts as input a threshold in order to flag reads which are of unacceptable read quality.
|
||||
*
|
||||
* This walker is designed to be used in conjunction with NeighborhoodQualityWalker.
|
||||
*/
|
||||
|
||||
public class ReadQualityScoreWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
@Argument(fullName = "inputQualityFile", shortName = "if", doc = "Input quality score file generated by NeighborhoodQualityWalker", required = true)
|
||||
protected String inputQualityFile = null;
|
||||
@Argument(fullName = "outputBamFile", shortName = "of", doc = "Write output to this BAM filename instead of STDOUT", required = false)
|
||||
protected SAMFileWriter outputBamFile = null;
|
||||
@Argument(fullName = "threshold", shortName = "th", doc="Flag reads whose read quality score is below this threshold", required = false)
|
||||
protected int qualityThreshold = 13;
|
||||
|
||||
protected static BufferedReader inputReader = null;
|
||||
protected static String line = null;
|
||||
|
||||
public SAMRecord map( char[] ref, SAMRecord read ) {
|
||||
return read; // all the work is done in the reduce step for this walker
|
||||
}
|
||||
|
||||
public SAMFileWriter reduceInit() {
|
||||
try {
|
||||
inputReader = new BufferedReader( new FileReader ( inputQualityFile ) );
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new StingException("Could not find required input file: " + inputQualityFile);
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Failed while reading data from input file: " + inputQualityFile);
|
||||
}
|
||||
return outputBamFile;
|
||||
}
|
||||
|
||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
||||
|
||||
int readQualityScore = 0;
|
||||
float meanNeighborhoodQuality = 0.0f;
|
||||
|
||||
// The large block of code below is parsing through the input file and calculating the meanNeighborhoodQuality over the length of the read
|
||||
// It does this by first skipping in the file to where the current read starts and marking that location
|
||||
// Next it continues reading lines for the length of the read generating a sum of neighborhood quality
|
||||
// When it reaches the end of the read it jumps back to the marker so that it can be used by the next read
|
||||
// BUGBUG: This assumes reads will be sorted by start location
|
||||
float sumNeighborhoodQuality = 0.0f;
|
||||
int numLines = 0;
|
||||
GenomeLoc readLoc = GenomeLocParser.createGenomeLoc( read );
|
||||
if( readLoc.size() > 0 ) { // only calculate mean NQS if the read has a well formed GenomeLoc, if not NQS will be zero
|
||||
try {
|
||||
if( line == null ) {
|
||||
line = inputReader.readLine();
|
||||
if( line == null ) { throw new StingException( "Input file is empty: " + inputQualityFile ); }
|
||||
}
|
||||
String[] halves = line.split( " ", 2 );
|
||||
GenomeLoc curLoc = GenomeLocParser.parseGenomeLoc( halves[0] );
|
||||
while( curLoc.isBefore( readLoc ) ) { // Loop until the beginning of the read
|
||||
line = inputReader.readLine();
|
||||
if( line == null ) { throw new StingException( "Input file doesn't encompass all reads. Can't find beginning of read: " + readLoc ); }
|
||||
halves = line.split( " ", 2 );
|
||||
curLoc = GenomeLocParser.parseGenomeLoc( halves[0] );
|
||||
}
|
||||
// now we have skipped ahead in the input file to where this read starts
|
||||
logger.debug( "Starting: " + curLoc + ", read: " + readLoc + "\t size: " + readLoc.size() );
|
||||
inputReader.mark( 30 * ( (int)readLoc.size() + 3 ) ); // BUGBUG: Is this a sufficient buffer size?
|
||||
String savedLine = line;
|
||||
|
||||
while( !curLoc.isPast( readLoc ) ) { // Loop until just past the end of the read
|
||||
sumNeighborhoodQuality += Float.parseFloat( halves[1] );
|
||||
numLines++;
|
||||
line = inputReader.readLine();
|
||||
if( line == null ) { throw new StingException( "Input file doesn't encompass all reads. Can't find end of read: " + readLoc ); }
|
||||
halves = line.split( " ", 2 );
|
||||
curLoc = GenomeLocParser.parseGenomeLoc( halves[0] );
|
||||
}
|
||||
// now we have parsed the input file up to where the read ends
|
||||
// reset back to the mark in order to parse the next read in the next call to the reduce function
|
||||
inputReader.reset();
|
||||
line = savedLine;
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException( "Could not find required input file: " + inputQualityFile );
|
||||
} catch (IOException e ) {
|
||||
throw new StingException( "Failed while reading data from input file: " + inputQualityFile + " Also, " + e.getMessage() );
|
||||
}
|
||||
|
||||
meanNeighborhoodQuality = sumNeighborhoodQuality / ((float) numLines);
|
||||
}
|
||||
|
||||
|
||||
// Find out if this read's mate mapped to a different chromosome
|
||||
//boolean isGoodPair = ( read.getReadPairedFlag() ? read.getProperPairFlag() : true );
|
||||
boolean isGoodPair = ( !read.getReadPairedFlag() || read.getProperPairFlag() ); // optimized version of above line
|
||||
|
||||
// Get the mapping quality for this read
|
||||
int mappingQuality = read.getMappingQuality();
|
||||
|
||||
// Get the number of reference mismatches in this read
|
||||
assert read.getReadLength() > 0 : "Read length must be greater than zero.";
|
||||
float mismatchRate = 1.0f;
|
||||
if( read.getAttribute("NM") != null ) {
|
||||
mismatchRate = ((float) Integer.parseInt(read.getAttribute("NM").toString())) / ((float) read.getReadLength());
|
||||
}
|
||||
|
||||
|
||||
// Calculate the three additional metrics that go into a read quality score
|
||||
// BUGBUG: some analysis is needed to determine reasonable quality values and rates for the exponentials
|
||||
float scoreMate = ( isGoodPair ? 40.0f : 2.0f );
|
||||
float scoreMapping = 40.0f * (float) Math.exp( -0.02f * Math.max( 99.0f - mappingQuality, 0.0f ) );
|
||||
// exp decay with rate 0.02, scaled to Q=40 when mapping quality is 99
|
||||
float scoreMismatch = 40.0f * (float) Math.exp( -27.0f * mismatchRate );
|
||||
// exp decay with rate 27.0, scaled to Q=40 when the mismatch rate is 0% for this read
|
||||
|
||||
// BUGBUG: some analysis is needed to determine reasonable weights for each metric
|
||||
readQualityScore = Math.round( 0.6f * meanNeighborhoodQuality + 0.1f * scoreMate + 0.05f * scoreMapping + 0.25f * scoreMismatch );
|
||||
if( readQualityScore == 0 ) { readQualityScore = 1; }
|
||||
assert readQualityScore > 0 : "Read quality score must be positive and nonzero.";
|
||||
|
||||
// Add the read quality score to the read in the new bam file and flag it if quality is below the given threshold
|
||||
// BUGBUG: which attributes should be set here?
|
||||
read.setAttribute( "XR", readQualityScore );
|
||||
if( readQualityScore < qualityThreshold ) {
|
||||
read.setAttribute( "ZR", 1 );
|
||||
}
|
||||
|
||||
// verbose debug printing lines
|
||||
logger.debug( read.getReadName() + " " + readQualityScore );
|
||||
logger.debug( "neighborhood quality =\t" + meanNeighborhoodQuality );
|
||||
logger.debug( "mate mismatch? =\t" + isGoodPair + " --> " + scoreMate );
|
||||
logger.debug( "mapping quality =\t" + mappingQuality + " --> " + scoreMapping );
|
||||
logger.debug( "ref mismatch rate =\t" + mismatchRate + " --> " + scoreMismatch );
|
||||
|
||||
// This printout useful for making histograms of scores in Matlab
|
||||
//out.println( readQualityScore + " " + meanNeighborhoodQuality + " " + scoreMate + " " + scoreMapping + " " + scoreMismatch );
|
||||
|
||||
// Add the read to the output bam file or output to STDOUT
|
||||
if ( output != null ) {
|
||||
output.addAlignment( read );
|
||||
} else {
|
||||
out.println( read.format() );
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
public void onTraversalDone( SAMFileWriter reduceResult ) {
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue