CountCovariates now uses any rod of type VariationRod with the name dbsnp as the source of known variant sites to skip over. It also grabs the platform string out of the read group when deciding which algorithm to use to calculate machine cycle. In this way it can now handle multi-platform bams. I added a new covariate: PositionCovariate. This is simply the offset regardless of which platform the read came from. This will be useful for comparing between the two covariates. Finally, this message serves as a warning that I will be killing the old recalibrator tomorrow after I've updated and verified new integration tests.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2077 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-18 23:03:47 +00:00
parent f667bed7fc
commit 0fbd81766b
11 changed files with 153 additions and 78 deletions

View File

@ -38,7 +38,7 @@ import net.sf.samtools.SAMRecord;
*/
public interface Covariate {
public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, byte[] bases); // used to pick out the value from attributes of the read
public Comparable getValue(SAMRecord read, int offset, String readGroup, String platform, byte[] quals, byte[] bases); // used to pick out the value from attributes of the read
public Comparable getValue(String str); // used to get value from input file
public int estimatedNumberOfBins(); // used to estimate the amount space required for the HashMap
}

View File

@ -3,11 +3,15 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.PrintStream;
import java.io.FileNotFoundException;
@ -70,8 +74,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private String[] COVARIATES = null;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false)
private String PLATFORM = "SLX";
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
@ -114,10 +116,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
// Warn the user if no dbSNP file was specified
if( this.getToolkit().getArguments().DBSNPFile == null ) {
boolean foundDBSNP = false;
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
foundDBSNP = true;
}
}
if( !foundDBSNP ) {
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
}
// Initialize the requested covariates by parsing the -cov argument
requestedCovariates = new ArrayList<Covariate>();
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
@ -130,9 +139,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
Covariate covariate = (Covariate)covClass.newInstance();
estimatedCapacity *= covariate.estimatedNumberOfBins();
// Some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); }
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again
requestedCovariates.add( covariate );
}
@ -160,9 +167,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
Covariate covariate = (Covariate)covClass.newInstance();
estimatedCapacity *= covariate.estimatedNumberOfBins();
// Some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); }
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
@ -209,11 +214,21 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
final rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP( tracker.getTrackData("dbsnp", null) );
// pull out anything passed by -B name,type,file that has the name "dbsnp"
final RODRecordList<ReferenceOrderedDatum> dbsnpRODs = tracker.getTrackData( "dbsnp", null );
boolean isSNP = false;
if (dbsnpRODs != null) {
for( ReferenceOrderedDatum rod : dbsnpRODs ) {
if( ((Variation)rod).isSNP() ) {
isSNP = true; // at least one of the rods says this is a snp site
break;
}
}
}
// Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality
if( dbsnp == null ) {
if( !isSNP ) {
final List<SAMRecord> reads = context.getReads();
final List<Integer> offsets = context.getOffsets();
SAMRecord read;
@ -223,6 +238,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
byte[] bases;
byte refBase;
byte prevBase;
String platform;
byte[] colorSpaceQuals;
final int numReads = reads.size();
// For each read at this locus
@ -251,30 +268,33 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
}
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
byte[] colorSpaceQuals = null;
if( PLATFORM.equalsIgnoreCase("SOLID") ) {
colorSpaceQuals = (byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
}
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
{
// skip if base quality is zero
if( quals[offset] > 0 ) {
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'
refBase = (byte)ref.getBase();
prevBase = bases[offset-1];
// skip if base quality is zero
if( quals[offset] > 0 ) {
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'
refBase = (byte)ref.getBase();
prevBase = bases[offset-1];
// Get the complement base strand if we are a negative strand read
if( read.getReadNegativeStrandFlag() ) {
bases = BaseUtils.simpleComplement( bases ); // this is an expensive call
refBase = (byte)BaseUtils.simpleComplement( ref.getBase() );
prevBase = bases[offset+1];
// Get the complement base strand if we are a negative strand read
if( read.getReadNegativeStrandFlag() ) {
bases = BaseUtils.simpleComplement( bases ); // this is an expensive call
refBase = (byte)BaseUtils.simpleComplement( ref.getBase() );
prevBase = bases[offset+1];
}
// skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)bases[offset] ) ) {
readGroup = read.getReadGroup().getReadGroupId(); // this is an expensive call
platform = read.getReadGroup().getPlatform(); // this is an expensive call
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
colorSpaceQuals = null;
if( platform.equalsIgnoreCase("SOLID") ) {
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
}
// skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)bases[offset] ) ) {
readGroup = read.getReadGroup().getReadGroupId(); // this is an expensive call
updateDataFromRead( read, offset, readGroup, quals, bases, refBase );
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
{
updateDataFromRead( read, offset, readGroup, platform, quals, bases, refBase );
}
}
}
@ -303,13 +323,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* @param bases The bases which make up the read
* @param refBase The reference base at this locus
*/
private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final byte[] bases, final byte refBase) {
private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases, final byte refBase) {
List<Comparable> key = new ArrayList<Comparable>();
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
for( Covariate covariate : requestedCovariates ) {
key.add( covariate.getValue( read, offset, readGroup, quals, bases ) );
key.add( covariate.getValue( read, offset, readGroup, platform, quals, bases ) );
}
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap

View File

@ -44,25 +44,18 @@ import net.sf.samtools.SAMRecord;
public class CycleCovariate implements Covariate {
private String platform;
public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
platform = "SLX"; // Solexa is the default
}
public CycleCovariate(final String _platform) {
platform = _platform;
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases) {
if( platform.equalsIgnoreCase( "SLX" ) ) {
int cycle = offset;
if( platform.equalsIgnoreCase( "ILLUMINA" ) ) {
int cycle = offset;
if( read.getReadNegativeStrandFlag() ) {
cycle = bases.length - (offset + 1);
}
return cycle;
} else if( platform.equalsIgnoreCase( "454" ) ) {
} else if( platform.contains( "454" ) ) { // some bams have "LS454" and others have just "454"
int cycle = 0;
byte prevBase = bases[0];
for( int iii = 1; iii <= offset; iii++ ) {
@ -76,7 +69,7 @@ public class CycleCovariate implements Covariate {
// the ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
return offset / 5; // integer division
} else {
throw new StingException( "Requested platform (" + platform + ") not supported in CycleCovariate." );
throw new StingException( "Platform in read (" + platform + ") is not supported in CycleCovariate. Read = " + read );
}
}

View File

@ -56,7 +56,7 @@ public class DinucCovariate implements Covariate {
}
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases) {
byte base = bases[offset];

View File

@ -40,7 +40,7 @@ public class MappingQualityCovariate implements Covariate {
public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases) {
return read.getMappingQuality();

View File

@ -48,7 +48,7 @@ public class MinimumNQSCovariate implements Covariate {
windowReach = windowSize / 2; // integer division
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases) {
// Loop over the list of base quality scores in the window and find the minimum

View File

@ -0,0 +1,64 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 18, 2009
*
* The Position covariate. It is a lot like the Cycle covariate except it always returns the offset regardless of which platform the read came from.
* This is the Solexa definition of machine cycle and the covariate that was always being used in the original version of the recalibrator.
*/
public class PositionCovariate implements Covariate {
public PositionCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases) {
int cycle = offset;
if( read.getReadNegativeStrandFlag() ) {
cycle = bases.length - (offset + 1);
}
return cycle;
}
public final Comparable getValue(final String str) {
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
}
public final int estimatedNumberOfBins() {
return 100;
}
public String toString() {
return "Position in Read";
}
}

View File

@ -41,26 +41,15 @@ import net.sf.samtools.SAMRecord;
public class PrimerRoundCovariate implements Covariate {
private String platform;
public PrimerRoundCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
platform = "SLX";
public PrimerRoundCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
}
public PrimerRoundCovariate(final String _platform) {
platform = _platform;
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases) {
if( platform.equalsIgnoreCase( "SLX" ) ) {
return 1; // nothing to do here because it is always the same
} else if( platform.equalsIgnoreCase( "454" ) ) {
return 1; // nothing to do here because it is always the same
} else if( platform.equalsIgnoreCase( "SOLID" ) ) {
return offset % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
if( platform.equalsIgnoreCase( "SOLID" ) ) {
return offset % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
} else {
throw new StingException( "Requested platform (" + platform + ") not supported in PrimerRoundCovariate." );
return 1; // nothing to do here because it is always the same
}
}

View File

@ -40,7 +40,7 @@ public class QualityScoreCovariate implements Covariate {
public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases) {
return (int)quals[offset];

View File

@ -40,7 +40,7 @@ public class ReadGroupCovariate implements Covariate{
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
final byte[] quals, final byte[] bases) {
return readGroup;
}

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
@ -69,8 +70,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false)
private String PLATFORM = "SLX";
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
@ -114,10 +113,20 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
boolean foundAllCovariates = false;
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
// Warn the user if a dbSNP file was specified since it isn't being used here
boolean foundDBSNP = false;
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
foundDBSNP = true;
}
}
if( foundDBSNP ) {
Utils.warnUser("A dbSNP rod file was specified but this walker doesn't make use of it.");
}
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList<Covariate>();
// Read in the data from the csv file and populate the map
logger.info( "Reading in the data from input file..." );
@ -139,9 +148,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
try {
Covariate covariate = (Covariate)covClass.newInstance();
// some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); }
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
estimatedCapacity *= covariate.estimatedNumberOfBins();
@ -243,6 +250,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// These calls are expensive so only do them once for each read
String readGroup = read.getReadGroup().getReadGroupId();
String platform = read.getReadGroup().getPlatform();
byte[] bases = read.getReadBases();
if( read.getReadNegativeStrandFlag() ) {
@ -254,7 +262,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
for( int iii = 1; iii < read.getReadLength() - 1; iii++ ) { // skip first and last bases because there is no dinuc
List<Comparable> key = new ArrayList<Comparable>();
for( Covariate covariate : requestedCovariates ) {
key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases ) ); // offset is zero based so passing iii is correct here
key.add( covariate.getValue( read, iii, readGroup, platform, originalQuals, bases ) ); // offset is zero based so passing iii is correct here
}
recalQuals[iii] = performSequentialQualityCalculation( key );
@ -275,8 +283,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
if( PLATFORM.equalsIgnoreCase("SOLID") ) {
byte[] colorSpaceQuals = (byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
if( platform.equalsIgnoreCase("SOLID") ) {
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
}