Added NeighborhoodQualityWalker.java and ReadQualityScoreWalker.java which are used to calculate a read quality score based on attributes of the read and the reads in the neighborhood.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1922 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-10-28 13:24:11 +00:00
parent 68fa6da788
commit 06ff81efe5
2 changed files with 362 additions and 0 deletions

View File

@ -0,0 +1,162 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.Iterator;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 23, 2009
*
* This walker is designed to work as the first pass in a two-pass processing step.
* It does a by-locus traversal calculating a neighborhood quality score based on a number of factors:
* 1.) Number of reads in neighborhood whose mate is mapped to a different chromosome.
* 2.) Average mapping quality of reads in the neighborhood.
* 3.) Average reference mismatch rate for all reads in the neighborhood.
* The output file is a list of: (GenomeLoc QualityScore) for every locus
*
* This walker is designed to be used in conjunction with ReadQualityScoreWalker.
*/
/**
* Example of what the output should look like:
*
* 1:10999919 25.781164
* 1:10999920 30.321754
* 1:10999921 30.321754
* 1:10999922 30.321754
* 1:10999923 30.005175
* 1:10999924 29.82714
* 1:10999925 29.901012
* 1:10999926 24.971085
* 1:10999927 24.634737
* 1:10999928 21.552652
* 1:10999929 21.95971
* 1:10999930 21.95971
* 1:10999931 20.272423
* 1:10999932 18.20454
* 1:10999933 18.20454
* 1:10999934 18.20454
*/
public class NeighborhoodQualityWalker extends LocusWalker<Integer, Long> {
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
float neighborhoodQualityScore = 0.0f;
int numReadsMismatchedMate = 0; // num of reads in this locus whose mate is mapped to different chromosome
float percentMismatchedMate = 0.0f; // percentage of reads at this locus whose mate is mapped to diff chromosome
float avgMappingQuality = 0.0f; // mean mapping quality for all reads at this locus
float avgMismatchRate = 0.0f; // mean mismatch with reference rate over all reads at this locus
int numValidMappingQuality = 0;
long sumMappingQuality = 0L;
float sumMismatchRate = 0.0f;
int mappingQuality = 0; // preallocate for use in while loop below
boolean isGoodPair = false; // preallocate for use in while loop below
SAMRecord read = null; // preallocate for use in while loop below
List<SAMRecord> reads = context.getReads();
Iterator<SAMRecord> readsIter = reads.iterator();
assert reads.size() > 0 : "This locus has no reads.";
while( readsIter.hasNext() ) { // for each read in this context
read = readsIter.next();
// Only consider reads for this calculation whose mapping quality isn't 0 or 255
mappingQuality = read.getMappingQuality();
if ( mappingQuality > 0 && mappingQuality < 255 ) {
// Generate sum of mapping quality for all reads at this locus
sumMappingQuality += mappingQuality;
numValidMappingQuality++;
// Look to see if mate was mapped to different chromosome
//isGoodPair = ( read.getReadPairedFlag() ? read.getProperPairFlag() : true );
isGoodPair = ( !read.getReadPairedFlag() || read.getProperPairFlag() ); // optimized version of above line
if ( !isGoodPair ) { numReadsMismatchedMate++; }
// Generate sum number of mismatches for all reads at this locus
if( read.getAttribute("NM") != null ) {
sumMismatchRate += ((float) Integer.parseInt(read.getAttribute("NM").toString())) / ((float) read.getReadLength());
} else {
sumMismatchRate += 1.0f;
}
}
}
// Calculate averages from sums which accumulated during while loop above
if ( numValidMappingQuality == 0 ) { numValidMappingQuality = 1; }
percentMismatchedMate = ((float) numReadsMismatchedMate) / ((float) numValidMappingQuality);
avgMappingQuality = sumMappingQuality / ((float) numValidMappingQuality);
avgMismatchRate = sumMismatchRate / ((float) numValidMappingQuality);
// Calculate the three metrics that go into a neighborhood quality score using exponential decay model
// BUGBUG: some analysis is needed to determine reasonable rates and scale factors for the exponential functions
float scoreMates = 40.0f * (float) Math.exp( -16.0f * (float) percentMismatchedMate );
// exp decay with rate 16.0, scaled to Q=40 when mismatched mates is 0%
float scoreMapping = 40.0f * (float) Math.exp( -0.02f * Math.max( 99.0f - avgMappingQuality, 0.0f ) );
// exp decay with rate 0.02, scaled to Q=40 when avg map quality is 99
float scoreMismatch = 40.0f * (float) Math.exp( -27.0f * avgMismatchRate );
// exp decay with rate 27.0, scaled to Q=40 when avg mismatch rate in reads is 0
// BUGBUG: some analysis is needed to determine reasonable weights for each metric
neighborhoodQualityScore = 0.35f * scoreMates + 0.05f * scoreMapping + 0.6f * scoreMismatch;
assert neighborhoodQualityScore >= 0.0f : "Neighborhood quality score must be nonnegative.";
if( neighborhoodQualityScore < 1.0f ) { neighborhoodQualityScore = 1.0f; }
// verbose debug printing lines
logger.debug( context.getLocation() + " " + neighborhoodQualityScore );
logger.debug( "mate mismatch% =\t" + percentMismatchedMate + " --> " + scoreMates );
logger.debug( "mean mappingQ =\t" + avgMappingQuality + " --> " + scoreMapping );
logger.debug( "mean mismatchRate =\t" + avgMismatchRate + " --> " + scoreMismatch );
// This printout useful for making histograms of scores in Matlab
//out.println( neighborhoodQualityScore + " " + scoreMates + " " + scoreMapping + " " + scoreMismatch);
out.println( context.getLocation() + " " + neighborhoodQualityScore );
return 0;
}
public Long reduceInit() {
return 0L;
}
public Long reduce( Integer value, Long sum ) {
return 0L; // nothing to do here
}
public void onTraversalDone( Long reduceResult ) {
}
}

View File

@ -0,0 +1,200 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 23, 2009
*
* This walker is designed to work as the second pass in a two-pass processing step.
* It does a by-read traversal calculating a read quality score based on a number of factors:
* 1.) Average neighborhood quality score for the length of the read (data generated by NeighborhoodQualityWalker)
* 2.) Is this read's mate mapped to a different chromosome?
* 3.) The mapping quality for this read.
* 4.) Number of reference mismatches in this read.
* This walker creates a new bam file in which each read is annotated by this read quality score
* in addition if the read quality score is below the given threshold, the read is flagged.
*
* This walker requires as input the file of (GenomeLoc QualityScore)'s generated by NeighborhoodQualityWalker.
* This walker accepts as input a threshold in order to flag reads which are of unacceptable read quality.
*
* This walker is designed to be used in conjunction with NeighborhoodQualityWalker.
*/
public class ReadQualityScoreWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(fullName = "inputQualityFile", shortName = "if", doc = "Input quality score file generated by NeighborhoodQualityWalker", required = true)
protected String inputQualityFile = null;
@Argument(fullName = "outputBamFile", shortName = "of", doc = "Write output to this BAM filename instead of STDOUT", required = false)
protected SAMFileWriter outputBamFile = null;
@Argument(fullName = "threshold", shortName = "th", doc="Flag reads whose read quality score is below this threshold", required = false)
protected int qualityThreshold = 13;
protected static BufferedReader inputReader = null;
protected static String line = null;
public SAMRecord map( char[] ref, SAMRecord read ) {
return read; // all the work is done in the reduce step for this walker
}
public SAMFileWriter reduceInit() {
try {
inputReader = new BufferedReader( new FileReader ( inputQualityFile ) );
} catch ( FileNotFoundException e) {
throw new StingException("Could not find required input file: " + inputQualityFile);
} catch (IOException e) {
throw new StingException("Failed while reading data from input file: " + inputQualityFile);
}
return outputBamFile;
}
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
int readQualityScore = 0;
float meanNeighborhoodQuality = 0.0f;
// The large block of code below is parsing through the input file and calculating the meanNeighborhoodQuality over the length of the read
// It does this by first skipping in the file to where the current read starts and marking that location
// Next it continues reading lines for the length of the read generating a sum of neighborhood quality
// When it reaches the end of the read it jumps back to the marker so that it can be used by the next read
// BUGBUG: This assumes reads will be sorted by start location
float sumNeighborhoodQuality = 0.0f;
int numLines = 0;
GenomeLoc readLoc = GenomeLocParser.createGenomeLoc( read );
if( readLoc.size() > 0 ) { // only calculate mean NQS if the read has a well formed GenomeLoc, if not NQS will be zero
try {
if( line == null ) {
line = inputReader.readLine();
if( line == null ) { throw new StingException( "Input file is empty: " + inputQualityFile ); }
}
String[] halves = line.split( " ", 2 );
GenomeLoc curLoc = GenomeLocParser.parseGenomeLoc( halves[0] );
while( curLoc.isBefore( readLoc ) ) { // Loop until the beginning of the read
line = inputReader.readLine();
if( line == null ) { throw new StingException( "Input file doesn't encompass all reads. Can't find beginning of read: " + readLoc ); }
halves = line.split( " ", 2 );
curLoc = GenomeLocParser.parseGenomeLoc( halves[0] );
}
// now we have skipped ahead in the input file to where this read starts
logger.debug( "Starting: " + curLoc + ", read: " + readLoc + "\t size: " + readLoc.size() );
inputReader.mark( 30 * ( (int)readLoc.size() + 3 ) ); // BUGBUG: Is this a sufficient buffer size?
String savedLine = line;
while( !curLoc.isPast( readLoc ) ) { // Loop until just past the end of the read
sumNeighborhoodQuality += Float.parseFloat( halves[1] );
numLines++;
line = inputReader.readLine();
if( line == null ) { throw new StingException( "Input file doesn't encompass all reads. Can't find end of read: " + readLoc ); }
halves = line.split( " ", 2 );
curLoc = GenomeLocParser.parseGenomeLoc( halves[0] );
}
// now we have parsed the input file up to where the read ends
// reset back to the mark in order to parse the next read in the next call to the reduce function
inputReader.reset();
line = savedLine;
} catch ( FileNotFoundException e ) {
throw new StingException( "Could not find required input file: " + inputQualityFile );
} catch (IOException e ) {
throw new StingException( "Failed while reading data from input file: " + inputQualityFile + " Also, " + e.getMessage() );
}
meanNeighborhoodQuality = sumNeighborhoodQuality / ((float) numLines);
}
// Find out if this read's mate mapped to a different chromosome
//boolean isGoodPair = ( read.getReadPairedFlag() ? read.getProperPairFlag() : true );
boolean isGoodPair = ( !read.getReadPairedFlag() || read.getProperPairFlag() ); // optimized version of above line
// Get the mapping quality for this read
int mappingQuality = read.getMappingQuality();
// Get the number of reference mismatches in this read
assert read.getReadLength() > 0 : "Read length must be greater than zero.";
float mismatchRate = 1.0f;
if( read.getAttribute("NM") != null ) {
mismatchRate = ((float) Integer.parseInt(read.getAttribute("NM").toString())) / ((float) read.getReadLength());
}
// Calculate the three additional metrics that go into a read quality score
// BUGBUG: some analysis is needed to determine reasonable quality values and rates for the exponentials
float scoreMate = ( isGoodPair ? 40.0f : 2.0f );
float scoreMapping = 40.0f * (float) Math.exp( -0.02f * Math.max( 99.0f - mappingQuality, 0.0f ) );
// exp decay with rate 0.02, scaled to Q=40 when mapping quality is 99
float scoreMismatch = 40.0f * (float) Math.exp( -27.0f * mismatchRate );
// exp decay with rate 27.0, scaled to Q=40 when the mismatch rate is 0% for this read
// BUGBUG: some analysis is needed to determine reasonable weights for each metric
readQualityScore = Math.round( 0.6f * meanNeighborhoodQuality + 0.1f * scoreMate + 0.05f * scoreMapping + 0.25f * scoreMismatch );
if( readQualityScore == 0 ) { readQualityScore = 1; }
assert readQualityScore > 0 : "Read quality score must be positive and nonzero.";
// Add the read quality score to the read in the new bam file and flag it if quality is below the given threshold
// BUGBUG: which attributes should be set here?
read.setAttribute( "XR", readQualityScore );
if( readQualityScore < qualityThreshold ) {
read.setAttribute( "ZR", 1 );
}
// verbose debug printing lines
logger.debug( read.getReadName() + " " + readQualityScore );
logger.debug( "neighborhood quality =\t" + meanNeighborhoodQuality );
logger.debug( "mate mismatch? =\t" + isGoodPair + " --> " + scoreMate );
logger.debug( "mapping quality =\t" + mappingQuality + " --> " + scoreMapping );
logger.debug( "ref mismatch rate =\t" + mismatchRate + " --> " + scoreMismatch );
// This printout useful for making histograms of scores in Matlab
//out.println( readQualityScore + " " + meanNeighborhoodQuality + " " + scoreMate + " " + scoreMapping + " " + scoreMismatch );
// Add the read to the output bam file or output to STDOUT
if ( output != null ) {
output.addAlignment( read );
} else {
out.println( read.format() );
}
return output;
}
public void onTraversalDone( SAMFileWriter reduceResult ) {
}
}