Implemented NQS covariate. Extended Cycle covariate to handle 454 and SOLID reads. Added a Primer Round covariate for SOLID reads.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2039 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-13 19:22:21 +00:00
parent bf451873ff
commit 394c839974
9 changed files with 154 additions and 73 deletions

View File

@ -41,5 +41,5 @@ public interface Covariate {
public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, char[] bases, char refBase); //used to pick out the value from attributes of the read
public Comparable getValue(String str); // used to get value from input file
public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap
public int estimatedNumberOfBins(); // used to estimate the amount space required for the HashMap
}

View File

@ -6,10 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.PackageUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.*;
import java.io.PrintStream;
import java.io.FileNotFoundException;
@ -70,9 +67,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
@Argument(fullName="min_mapping_quality", shortName="minmap", required=false, doc="Only use reads with at least this mapping quality score")
private int MIN_MAPPING_QUALITY = 1;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false; // BUGBUG need to pass this value to the proper covariates
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false)
private String PLATFORM = "SLX";
@Argument(fullName = "windowSizeNQS", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 1;
@Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file")
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName="noPrintHeader", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file")
@ -83,7 +82,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
//private HashMap<SAMRecord, String> readGroupHashMap; // A hash map that hashes the read object itself into the read group name
// This is done for optimization purposes because pulling the read group out of the SAMRecord is expensive
// This is done for optimization purposes because pulling the read group out of the SAMRecord is expensive
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
@ -119,7 +118,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
requestedCovariates = new ArrayList<Covariate>();
if( validateOldRecalibrator ) {
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new CycleCovariate() ); // unfortunately the ordering here is different to match the output of the old recalibrator
requestedCovariates.add( new CycleCovariate( PLATFORM ) ); // unfortunately the ordering here is different to match the output of the old recalibrator
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new DinucCovariate() );
//estimatedCapacity = 300 * 200 * 40 * 16;
@ -139,6 +138,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
try {
// Now that we've found a matching class, try to instantiate it
Covariate covariate = (Covariate)covClass.newInstance();
// some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); }
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
//estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity
} catch ( InstantiationException e ) {
@ -157,7 +160,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
logger.info( "Note: Using default set of covariates because none were specified." );
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new CycleCovariate() );
requestedCovariates.add( new CycleCovariate( PLATFORM ) );
requestedCovariates.add( new DinucCovariate() );
//estimatedCapacity = 300 * 40 * 200 * 16;
}
@ -222,6 +225,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
if( offset > 0 && offset < read.getReadLength() - 1) {
quals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
quals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
// skip if base quality is zero
if( quals[offset] > 0 ) {
bases = read.getReadString().toCharArray();

View File

@ -36,11 +36,10 @@ import net.sf.samtools.SAMRecord;
*
* The Cycle covariate.
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
* Yet to be implemented:
* - For 454 the cycle is the number of discontinuous nucleotides seen during the length of the read
* For 454 the cycle is the number of discontinuous nucleotides seen during the length of the read
* For example, for the read: AAACCCCGAAATTTTTTT
* the cycle would be 111222234445555555
* - For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
*/
public class CycleCovariate implements Covariate {
@ -64,9 +63,18 @@ public class CycleCovariate implements Covariate {
}
return cycle;
} else if( platform.equalsIgnoreCase( "454" ) ) {
return 0; //BUGBUG: not yet implemented
int cycle = 1;
char prevBase = bases[0];
for( int iii = 1; iii <= offset; iii++ ) {
if(bases[iii] != prevBase) { // this base doesn't match the previous one so it is a new cycle
cycle++;
prevBase = bases[iii];
}
}
return cycle;
} else if( platform.equalsIgnoreCase( "SOLID" ) ) {
return 0; //BUGBUG: not yet implemented
// the ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
return (offset / 5) + 1; // integer division
} else {
throw new StingException( "Requested platform (" + platform + ") not supported in CycleCovariate." );
}
@ -74,7 +82,7 @@ public class CycleCovariate implements Covariate {
}
public final Comparable getValue(final String str) {
return (int)Integer.parseInt( str );
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
}
public final int estimatedNumberOfBins() {

View File

@ -43,11 +43,11 @@ public class MappingQualityCovariate implements Covariate {
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code
return read.getMappingQuality();
}
public final Comparable getValue(final String str) {
return Integer.parseInt( str );
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
}
public final int estimatedNumberOfBins() {

View File

@ -34,48 +34,38 @@ import org.broadinstitute.sting.utils.QualityUtils;
* Date: Nov 4, 2009
*
* The Minimum Neighborhood Quality Score covariate, originally described by Chris Hartl.
* This covariate is the minimum base quality score along the length of the read.
* BUGBUG: I don't think this is what was intended by Chris. Covariate interface must change to implement the real covariate.
* This covariate is the minimum base quality score in the read in a small window around the current base.
*/
public class MinimumNQSCovariate implements Covariate {
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
private boolean USE_ORIGINAL_QUALS;
private int windowReach; // how far in each direction from the current base to look
public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
USE_ORIGINAL_QUALS = false;
windowReach = 1; // window size = 3 was the best covariate according to Chris's analysis
}
public MinimumNQSCovariate(final boolean originalQuals) {
USE_ORIGINAL_QUALS = originalQuals;
public MinimumNQSCovariate(final int windowSize) {
windowReach = windowSize / 2; // integer division
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
//BUGBUG: Original qualities
//if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
// Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
// if ( obj instanceof String )
// quals = QualityUtils.fastqToPhred((String)obj);
// else {
// throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
// }
//}
// Loop over the list of qualities and find the minimum
Integer minQual = (int)(quals[0]);
for ( int qual : quals ) {
if( qual < minQual ) {
minQual = qual;
// Loop over the list of base quality scores in the window and find the minimum
int minQual = quals[offset];
int minIndex = Math.max(offset - windowReach, 0);
int maxIndex = Math.min(offset + windowReach, quals.length - 1);
for ( int iii = minIndex; iii < maxIndex; iii++ ) {
if( quals[iii] < minQual ) {
minQual = quals[iii];
}
}
return minQual;
}
public final Comparable getValue(final String str) {
return Integer.parseInt( str );
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
}
public final int estimatedNumberOfBins() {

View File

@ -0,0 +1,79 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 13, 2009
*
* The Primer Round covariate.
* For Solexa and 454 this is the same value of the length of the read.
* For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
*/
public class PrimerRoundCovariate implements Covariate {
private String platform;
public PrimerRoundCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
platform = "SLX";
}
public PrimerRoundCovariate(final String _platform) {
platform = _platform;
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
if( platform.equalsIgnoreCase( "SLX" ) ) {
return 1; // nothing to do here because it is always the same
} else if( platform.equalsIgnoreCase( "454" ) ) {
return 1; // nothing to do here because it is always the same
} else if( platform.equalsIgnoreCase( "SOLID" ) ) {
return offset % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
} else {
throw new StingException( "Requested platform (" + platform + ") not supported in PrimerRoundCovariate." );
}
}
public final Comparable getValue(final String str) {
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
}
public final int estimatedNumberOfBins() {
return 5;
}
public String toString() {
return "Primer Round";
}
}

View File

@ -38,36 +38,17 @@ import org.broadinstitute.sting.utils.QualityUtils;
public class QualityScoreCovariate implements Covariate {
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
protected boolean USE_ORIGINAL_QUALS;
public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
USE_ORIGINAL_QUALS = false;
}
public QualityScoreCovariate(final boolean originalQuals) {
USE_ORIGINAL_QUALS = originalQuals;
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
//BUGBUG: original qualities
//if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
// Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
// if ( obj instanceof String )
// quals = QualityUtils.fastqToPhred((String)obj);
// else {
// throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
// }
//}
//Integer returnQual = (int)quals[offset];
return (int)quals[offset]; // returning an Integer Object (as opposed to primitive int) is required so that return values from the two getValue methods hash to the same code
return (int)quals[offset];
}
public final Comparable getValue(final String str) {
return (int)Integer.parseInt( str );
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
}
public final int estimatedNumberOfBins() {

View File

@ -44,6 +44,8 @@ public class RecalDataManager {
private boolean collapsedTablesCreated;
public NHashMap<Double> dataSumExpectedErrors;
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
RecalDataManager() {
data = new NHashMap<RecalDatum>();
collapsedTablesCreated = false;

View File

@ -63,7 +63,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
public int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
public boolean USE_ORIGINAL_QUALS = false;
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false)
private String PLATFORM = "SLX";
@Argument(fullName = "windowSizeNQS", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 1;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
public int SMOOTHING = 1;
@ -82,8 +86,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
//---------------------------------------------------------------------------------------------------------------
//
// initialize
@ -102,7 +105,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
int lineNumber = 0;
boolean foundAllCovariates = false;
int estimatedCapacity = 1;
//int estimatedCapacity = 1;
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList<Covariate>();
@ -128,8 +131,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
foundClass = true;
try {
Covariate covariate = (Covariate)covClass.newInstance();
// some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); }
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
estimatedCapacity *= covariate.estimatedNumberOfBins();
//estimatedCapacity *= covariate.estimatedNumberOfBins();
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
@ -150,7 +157,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
foundAllCovariates = true;
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
dataManager = new RecalDataManager( estimatedCapacity );
//dataManager = new RecalDataManager( estimatedCapacity );
dataManager = new RecalDataManager();
}
addCSVData(line); // parse the line and add the data to the HashMap
@ -209,12 +217,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
byte[] originalQuals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
originalQuals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
byte[] recalQuals = originalQuals.clone();
@ -255,8 +263,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
if ( read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
read.setAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
}