From 06ff81efe57c48d865282e977ca71e178fb44234 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 28 Oct 2009 13:24:11 +0000 Subject: [PATCH] Added NeighborhoodQualityWalker.java and ReadQualityScoreWalker.java which are used to calculate a read quality score based on attributes of the read and the reads in the neighborhood. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1922 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/NeighborhoodQualityWalker.java | 162 ++++++++++++++ .../gatk/walkers/ReadQualityScoreWalker.java | 200 ++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/NeighborhoodQualityWalker.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadQualityScoreWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/NeighborhoodQualityWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/NeighborhoodQualityWalker.java new file mode 100755 index 000000000..42faa79a6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/NeighborhoodQualityWalker.java @@ -0,0 +1,162 @@ +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.Iterator; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Oct 23, 2009 + * + * This walker is designed to work as the first pass in a two-pass processing step. + * It does a by-locus traversal calculating a neighborhood quality score based on a number of factors: + * 1.) Number of reads in neighborhood whose mate is mapped to a different chromosome. + * 2.) Average mapping quality of reads in the neighborhood. + * 3.) Average reference mismatch rate for all reads in the neighborhood. + * The output file is a list of: (GenomeLoc QualityScore) for every locus + * + * This walker is designed to be used in conjunction with ReadQualityScoreWalker. + */ + +/** + * Example of what the output should look like: + * + * 1:10999919 25.781164 + * 1:10999920 30.321754 + * 1:10999921 30.321754 + * 1:10999922 30.321754 + * 1:10999923 30.005175 + * 1:10999924 29.82714 + * 1:10999925 29.901012 + * 1:10999926 24.971085 + * 1:10999927 24.634737 + * 1:10999928 21.552652 + * 1:10999929 21.95971 + * 1:10999930 21.95971 + * 1:10999931 20.272423 + * 1:10999932 18.20454 + * 1:10999933 18.20454 + * 1:10999934 18.20454 + */ + +public class NeighborhoodQualityWalker extends LocusWalker { + + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + float neighborhoodQualityScore = 0.0f; + int numReadsMismatchedMate = 0; // num of reads in this locus whose mate is mapped to different chromosome + float percentMismatchedMate = 0.0f; // percentage of reads at this locus whose mate is mapped to diff chromosome + float avgMappingQuality = 0.0f; // mean mapping quality for all reads at this locus + float avgMismatchRate = 0.0f; // mean mismatch with reference rate over all reads at this locus + + int numValidMappingQuality = 0; + long sumMappingQuality = 0L; + float sumMismatchRate = 0.0f; + int mappingQuality = 0; // preallocate for use in while loop below + boolean isGoodPair = false; // preallocate for use in while loop below + SAMRecord read = null; // preallocate for use in while loop below + + List reads = context.getReads(); + Iterator readsIter = reads.iterator(); + assert reads.size() > 0 : "This locus has no reads."; + + while( readsIter.hasNext() ) { // for each read in this context + read = readsIter.next(); + + // Only consider reads for this calculation whose mapping quality isn't 0 or 255 + mappingQuality = read.getMappingQuality(); + if ( mappingQuality > 0 && mappingQuality < 255 ) { + + // Generate sum of mapping quality for all reads at this locus + sumMappingQuality += mappingQuality; + numValidMappingQuality++; + + // Look to see if mate was mapped to different chromosome + //isGoodPair = ( read.getReadPairedFlag() ? read.getProperPairFlag() : true ); + isGoodPair = ( !read.getReadPairedFlag() || read.getProperPairFlag() ); // optimized version of above line + if ( !isGoodPair ) { numReadsMismatchedMate++; } + + // Generate sum number of mismatches for all reads at this locus + if( read.getAttribute("NM") != null ) { + sumMismatchRate += ((float) Integer.parseInt(read.getAttribute("NM").toString())) / ((float) read.getReadLength()); + } else { + sumMismatchRate += 1.0f; + } + } + } + + // Calculate averages from sums which accumulated during while loop above + if ( numValidMappingQuality == 0 ) { numValidMappingQuality = 1; } + percentMismatchedMate = ((float) numReadsMismatchedMate) / ((float) numValidMappingQuality); + avgMappingQuality = sumMappingQuality / ((float) numValidMappingQuality); + avgMismatchRate = sumMismatchRate / ((float) numValidMappingQuality); + + // Calculate the three metrics that go into a neighborhood quality score using exponential decay model + // BUGBUG: some analysis is needed to determine reasonable rates and scale factors for the exponential functions + float scoreMates = 40.0f * (float) Math.exp( -16.0f * (float) percentMismatchedMate ); + // exp decay with rate 16.0, scaled to Q=40 when mismatched mates is 0% + float scoreMapping = 40.0f * (float) Math.exp( -0.02f * Math.max( 99.0f - avgMappingQuality, 0.0f ) ); + // exp decay with rate 0.02, scaled to Q=40 when avg map quality is 99 + float scoreMismatch = 40.0f * (float) Math.exp( -27.0f * avgMismatchRate ); + // exp decay with rate 27.0, scaled to Q=40 when avg mismatch rate in reads is 0 + + // BUGBUG: some analysis is needed to determine reasonable weights for each metric + neighborhoodQualityScore = 0.35f * scoreMates + 0.05f * scoreMapping + 0.6f * scoreMismatch; + assert neighborhoodQualityScore >= 0.0f : "Neighborhood quality score must be nonnegative."; + if( neighborhoodQualityScore < 1.0f ) { neighborhoodQualityScore = 1.0f; } + + // verbose debug printing lines + logger.debug( context.getLocation() + " " + neighborhoodQualityScore ); + logger.debug( "mate mismatch% =\t" + percentMismatchedMate + " --> " + scoreMates ); + logger.debug( "mean mappingQ =\t" + avgMappingQuality + " --> " + scoreMapping ); + logger.debug( "mean mismatchRate =\t" + avgMismatchRate + " --> " + scoreMismatch ); + + // This printout useful for making histograms of scores in Matlab + //out.println( neighborhoodQualityScore + " " + scoreMates + " " + scoreMapping + " " + scoreMismatch); + + out.println( context.getLocation() + " " + neighborhoodQualityScore ); + return 0; + } + + public Long reduceInit() { + return 0L; + } + + public Long reduce( Integer value, Long sum ) { + return 0L; // nothing to do here + } + + public void onTraversalDone( Long reduceResult ) { + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadQualityScoreWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadQualityScoreWalker.java new file mode 100755 index 000000000..b081dca82 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadQualityScoreWalker.java @@ -0,0 +1,200 @@ +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileWriter; + +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.IOException; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Oct 23, 2009 + * + * This walker is designed to work as the second pass in a two-pass processing step. + * It does a by-read traversal calculating a read quality score based on a number of factors: + * 1.) Average neighborhood quality score for the length of the read (data generated by NeighborhoodQualityWalker) + * 2.) Is this read's mate mapped to a different chromosome? + * 3.) The mapping quality for this read. + * 4.) Number of reference mismatches in this read. + * This walker creates a new bam file in which each read is annotated by this read quality score + * in addition if the read quality score is below the given threshold, the read is flagged. + * + * This walker requires as input the file of (GenomeLoc QualityScore)'s generated by NeighborhoodQualityWalker. + * This walker accepts as input a threshold in order to flag reads which are of unacceptable read quality. + * + * This walker is designed to be used in conjunction with NeighborhoodQualityWalker. + */ + +public class ReadQualityScoreWalker extends ReadWalker { + @Argument(fullName = "inputQualityFile", shortName = "if", doc = "Input quality score file generated by NeighborhoodQualityWalker", required = true) + protected String inputQualityFile = null; + @Argument(fullName = "outputBamFile", shortName = "of", doc = "Write output to this BAM filename instead of STDOUT", required = false) + protected SAMFileWriter outputBamFile = null; + @Argument(fullName = "threshold", shortName = "th", doc="Flag reads whose read quality score is below this threshold", required = false) + protected int qualityThreshold = 13; + + protected static BufferedReader inputReader = null; + protected static String line = null; + + public SAMRecord map( char[] ref, SAMRecord read ) { + return read; // all the work is done in the reduce step for this walker + } + + public SAMFileWriter reduceInit() { + try { + inputReader = new BufferedReader( new FileReader ( inputQualityFile ) ); + } catch ( FileNotFoundException e) { + throw new StingException("Could not find required input file: " + inputQualityFile); + } catch (IOException e) { + throw new StingException("Failed while reading data from input file: " + inputQualityFile); + } + return outputBamFile; + } + + public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { + + int readQualityScore = 0; + float meanNeighborhoodQuality = 0.0f; + + // The large block of code below is parsing through the input file and calculating the meanNeighborhoodQuality over the length of the read + // It does this by first skipping in the file to where the current read starts and marking that location + // Next it continues reading lines for the length of the read generating a sum of neighborhood quality + // When it reaches the end of the read it jumps back to the marker so that it can be used by the next read + // BUGBUG: This assumes reads will be sorted by start location + float sumNeighborhoodQuality = 0.0f; + int numLines = 0; + GenomeLoc readLoc = GenomeLocParser.createGenomeLoc( read ); + if( readLoc.size() > 0 ) { // only calculate mean NQS if the read has a well formed GenomeLoc, if not NQS will be zero + try { + if( line == null ) { + line = inputReader.readLine(); + if( line == null ) { throw new StingException( "Input file is empty: " + inputQualityFile ); } + } + String[] halves = line.split( " ", 2 ); + GenomeLoc curLoc = GenomeLocParser.parseGenomeLoc( halves[0] ); + while( curLoc.isBefore( readLoc ) ) { // Loop until the beginning of the read + line = inputReader.readLine(); + if( line == null ) { throw new StingException( "Input file doesn't encompass all reads. Can't find beginning of read: " + readLoc ); } + halves = line.split( " ", 2 ); + curLoc = GenomeLocParser.parseGenomeLoc( halves[0] ); + } + // now we have skipped ahead in the input file to where this read starts + logger.debug( "Starting: " + curLoc + ", read: " + readLoc + "\t size: " + readLoc.size() ); + inputReader.mark( 30 * ( (int)readLoc.size() + 3 ) ); // BUGBUG: Is this a sufficient buffer size? + String savedLine = line; + + while( !curLoc.isPast( readLoc ) ) { // Loop until just past the end of the read + sumNeighborhoodQuality += Float.parseFloat( halves[1] ); + numLines++; + line = inputReader.readLine(); + if( line == null ) { throw new StingException( "Input file doesn't encompass all reads. Can't find end of read: " + readLoc ); } + halves = line.split( " ", 2 ); + curLoc = GenomeLocParser.parseGenomeLoc( halves[0] ); + } + // now we have parsed the input file up to where the read ends + // reset back to the mark in order to parse the next read in the next call to the reduce function + inputReader.reset(); + line = savedLine; + + } catch ( FileNotFoundException e ) { + throw new StingException( "Could not find required input file: " + inputQualityFile ); + } catch (IOException e ) { + throw new StingException( "Failed while reading data from input file: " + inputQualityFile + " Also, " + e.getMessage() ); + } + + meanNeighborhoodQuality = sumNeighborhoodQuality / ((float) numLines); + } + + + // Find out if this read's mate mapped to a different chromosome + //boolean isGoodPair = ( read.getReadPairedFlag() ? read.getProperPairFlag() : true ); + boolean isGoodPair = ( !read.getReadPairedFlag() || read.getProperPairFlag() ); // optimized version of above line + + // Get the mapping quality for this read + int mappingQuality = read.getMappingQuality(); + + // Get the number of reference mismatches in this read + assert read.getReadLength() > 0 : "Read length must be greater than zero."; + float mismatchRate = 1.0f; + if( read.getAttribute("NM") != null ) { + mismatchRate = ((float) Integer.parseInt(read.getAttribute("NM").toString())) / ((float) read.getReadLength()); + } + + + // Calculate the three additional metrics that go into a read quality score + // BUGBUG: some analysis is needed to determine reasonable quality values and rates for the exponentials + float scoreMate = ( isGoodPair ? 40.0f : 2.0f ); + float scoreMapping = 40.0f * (float) Math.exp( -0.02f * Math.max( 99.0f - mappingQuality, 0.0f ) ); + // exp decay with rate 0.02, scaled to Q=40 when mapping quality is 99 + float scoreMismatch = 40.0f * (float) Math.exp( -27.0f * mismatchRate ); + // exp decay with rate 27.0, scaled to Q=40 when the mismatch rate is 0% for this read + + // BUGBUG: some analysis is needed to determine reasonable weights for each metric + readQualityScore = Math.round( 0.6f * meanNeighborhoodQuality + 0.1f * scoreMate + 0.05f * scoreMapping + 0.25f * scoreMismatch ); + if( readQualityScore == 0 ) { readQualityScore = 1; } + assert readQualityScore > 0 : "Read quality score must be positive and nonzero."; + + // Add the read quality score to the read in the new bam file and flag it if quality is below the given threshold + // BUGBUG: which attributes should be set here? + read.setAttribute( "XR", readQualityScore ); + if( readQualityScore < qualityThreshold ) { + read.setAttribute( "ZR", 1 ); + } + + // verbose debug printing lines + logger.debug( read.getReadName() + " " + readQualityScore ); + logger.debug( "neighborhood quality =\t" + meanNeighborhoodQuality ); + logger.debug( "mate mismatch? =\t" + isGoodPair + " --> " + scoreMate ); + logger.debug( "mapping quality =\t" + mappingQuality + " --> " + scoreMapping ); + logger.debug( "ref mismatch rate =\t" + mismatchRate + " --> " + scoreMismatch ); + + // This printout useful for making histograms of scores in Matlab + //out.println( readQualityScore + " " + meanNeighborhoodQuality + " " + scoreMate + " " + scoreMapping + " " + scoreMismatch ); + + // Add the read to the output bam file or output to STDOUT + if ( output != null ) { + output.addAlignment( read ); + } else { + out.println( read.format() ); + } + + return output; + } + + public void onTraversalDone( SAMFileWriter reduceResult ) { + } + +}