Working with byte arrays is faster than working with Strings so the Covariates now take in byte arrays. None of the Covariates themselves used the reference base so I removed it. DinucCovariate now returns a Dinuc object which implements Comparable instead of returning a String because it was too slow. CountCovariates now uses a read filter to filter out unmapped reads and allows the user to specify -cov all which will use all of the available covariates, of which there are 7 now. If no covariates are specified it defaults to ReadGroup and QualityScore, the two required covariates. Initial code in place to leave SOLID bases alone if they have bad color space quality. TableRecalibration uses @Requires to tell the GATK to not give the reference bases since they weren't being used for anything.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2062 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-17 21:50:52 +00:00
parent 4d9c826766
commit ec1a870905
13 changed files with 278 additions and 268 deletions

View File

@ -38,7 +38,7 @@ import net.sf.samtools.SAMRecord;
*/
public interface Covariate {
public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, char[] bases, char refBase); // used to pick out the value from attributes of the read
public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, byte[] bases); // used to pick out the value from attributes of the read
public Comparable getValue(String str); // used to get value from input file
public int estimatedNumberOfBins(); // used to estimate the amount space required for the HashMap
}

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
@ -57,27 +58,26 @@ import net.sf.samtools.SAMRecord;
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
*/
@WalkerName("CountCovariatesRefactored")
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
@WalkerName( "CountCovariatesRefactored" )
@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
private Boolean LIST_ONLY = false;
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
private String[] COVARIATES = null;
@Argument(fullName="min_mapping_quality", shortName="minmap", required=false, doc="Only use reads with at least this mapping quality score")
private int MIN_MAPPING_QUALITY = 1;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false)
private String PLATFORM = "SLX";
@Argument(fullName = "windowSizeNQS", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName="noPrintHeader", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file")
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.")
private boolean NO_PRINT_HEADER = false;
@Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="Depricated, no longer replicates previous behavior exactly due to clean up of structure of code. If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker")
private boolean validateOldRecalibrator = false;
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
@ -101,8 +101,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Get a list of all available covariates
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
//int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate
// Print and exit if that's what was requested
if ( LIST_ONLY ) {
out.println( "Available covariates:" );
@ -121,60 +120,75 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Initialize the requested covariates by parsing the -cov argument
requestedCovariates = new ArrayList<Covariate>();
if( validateOldRecalibrator ) {
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new CycleCovariate( PLATFORM ) ); // unfortunately the ordering here is different to match the output of the old recalibrator
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new DinucCovariate() );
//estimatedCapacity = 300 * 200 * 40 * 16;
} else if( COVARIATES != null ) {
int covNumber = 1;
for( String requestedCovariateString : COVARIATES ) {
boolean foundClass = false;
int estimatedCapacity = 1; // capacity is multiplicative so this starts at one
if( COVARIATES != null ) {
if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used
requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found
requestedCovariates.add( new QualityScoreCovariate() );
for( Class<?> covClass : classes ) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
foundClass = true;
// Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list
if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) ||
(covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) {
throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" );
}
covNumber++;
try {
// Now that we've found a matching class, try to instantiate it
Covariate covariate = (Covariate)covClass.newInstance();
// some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); }
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
try {
Covariate covariate = (Covariate)covClass.newInstance();
estimatedCapacity *= covariate.estimatedNumberOfBins();
// Some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); }
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again
requestedCovariates.add( covariate );
//estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
}
} else { // The user has specified a list of several covariates
int covNumber = 1;
for( String requestedCovariateString : COVARIATES ) {
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
foundClass = true;
// Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list
if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) ||
(covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) {
throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" );
}
covNumber++;
try {
// Now that we've found a matching class, try to instantiate it
Covariate covariate = (Covariate)covClass.newInstance();
estimatedCapacity *= covariate.estimatedNumberOfBins();
// Some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); }
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
}
}
}
if( !foundClass ) {
throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
if( !foundClass ) {
throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
}
}
}
} else { // Not validating old recalibrator and no covariates were specified by the user so add the default ones
logger.info( "Note: Using default set of covariates because none were specified." );
} else { // No covariates were specified by the user so add the default, required ones
Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." );
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new CycleCovariate( PLATFORM ) );
requestedCovariates.add( new DinucCovariate() );
//estimatedCapacity = 300 * 40 * 200 * 16;
estimatedCapacity = 300 * 40;
}
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
//dataManager = new RecalDataManager( estimatedCapacity );
dataManager = new RecalDataManager();
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
dataManager = new RecalDataManager( estimatedCapacity );
//readGroupHashMap = new HashMap<SAMRecord, String>( 50000000, 0.97f );
}
@ -195,7 +209,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
final rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
final rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP( tracker.getTrackData("dbsnp", null) );
// Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
@ -206,61 +220,65 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
int offset;
String readGroup;
byte[] quals;
char[] bases;
char refBase;
char prevBase;
byte[] bases;
byte refBase;
byte prevBase;
final int numReads = reads.size();
// For each read at this locus
for( int iii = 0; iii < numReads; iii++ ) {
read = reads.get(iii);
// Only use data from reads with mapping quality above specified quality value
if( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) {
//readGroup = readGroupHashMap.get( read );
//if( readGroup == null ) { // read is not in the hashmap so add it
// readGroup = read.getReadGroup().getReadGroupId();
// readGroupHashMap.put( read, readGroup );
//}
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
// skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases
if( offset > 0 && offset < read.getReadLength() - 1) {
quals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
quals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
//readGroup = readGroupHashMap.get( read );
//if( readGroup == null ) { // read is not in the hashmap so add it
// readGroup = read.getReadGroup().getReadGroupId();
// readGroupHashMap.put( read, readGroup );
//}
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
// skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases
if( offset > 0 && offset < read.getReadLength() - 1 ) {
quals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
quals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
byte[] colorSpaceQuals = null;
if( PLATFORM.equalsIgnoreCase("SOLID") ) {
colorSpaceQuals = (byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
}
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
{
// skip if base quality is zero
if( quals[offset] > 0 ) {
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'
refBase = (byte)ref.getBase();
prevBase = bases[offset-1];
// Get the complement base strand if we are a negative strand read
if( read.getReadNegativeStrandFlag() ) {
bases = BaseUtils.simpleComplement( bases ); // this is an expensive call
refBase = (byte)BaseUtils.simpleComplement( ref.getBase() );
prevBase = bases[offset+1];
}
// skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)bases[offset] ) ) {
readGroup = read.getReadGroup().getReadGroupId(); // this is an expensive call
updateDataFromRead( read, offset, readGroup, quals, bases, refBase );
}
}
// skip if base quality is zero
if( quals[offset] > 0 ) {
bases = read.getReadString().toCharArray();
refBase = ref.getBase();
prevBase = bases[offset-1];
// Get the complement base strand if we are a negative strand read
if( read.getReadNegativeStrandFlag() ) {
bases = BaseUtils.simpleComplement( read.getReadString() ).toCharArray(); // this is an expensive call
refBase = BaseUtils.simpleComplement( refBase );
prevBase = bases[offset+1];
}
// skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase(prevBase) && BaseUtils.isRegularBase(bases[offset]) ) {
readGroup = read.getReadGroup().getReadGroupId(); // this is an expensive call
updateDataFromRead( read, offset, readGroup, quals, bases, refBase );
}
}
}
}
}
}
countedSites++;
@ -285,31 +303,27 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* @param bases The bases which make up the read
* @param refBase The reference base at this locus
*/
private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final char[] bases, final char refBase) {
private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final byte[] bases, final byte refBase) {
List<Comparable> key = new ArrayList<Comparable>();
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
for( Covariate covariate : requestedCovariates ) {
key.add( covariate.getValue( read, offset, readGroup, quals, bases, refBase ) );
key.add( covariate.getValue( read, offset, readGroup, quals, bases ) );
}
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
RecalDatum datum = dataManager.data.get( key );
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
if( validateOldRecalibrator ) {
dataManager.data.myPut( key, datum );
} else {
dataManager.data.put( key, datum );
}
dataManager.data.put( key, datum );
}
// Need the bases to determine whether or not we have a mismatch
char base = bases[offset];
byte base = bases[offset];
// Add one to the number of observations and potentially one to the number of mismatches
datum.increment( base, refBase );
datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
countedBases++;
}
@ -360,49 +374,27 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/
private void outputToCSV( final PrintStream recalTableStream ) {
if( validateOldRecalibrator ) {
final boolean collapsePos = false;
final boolean collapseDinuc = false;
recalTableStream.printf("# collapsed_pos %b%n", collapsePos);
recalTableStream.printf("# collapsed_dinuc %b%n", collapseDinuc);
recalTableStream.printf("# counted_sites %d%n", countedSites);
recalTableStream.printf("# counted_bases %d%n", countedBases);
recalTableStream.printf("# skipped_sites %d%n", skippedSites);
recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
for(Pair<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySetSorted4() ) {
// For each Covariate in the key
for( Comparable comp : entry.getFirst() ) {
// Output the Covariate's value
recalTableStream.print( comp + "," );
}
// Output the RecalDatum entry
recalTableStream.println( entry.getSecond().outputToCSV() );
}
} else {
if( !NO_PRINT_HEADER ) {
recalTableStream.printf("# Counted Sites %d%n", countedSites);
recalTableStream.printf("# Counted Bases %d%n", countedBases);
recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
for( Covariate cov : requestedCovariates ) {
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
}
}
// For each entry in the data hashmap
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
// For each Covariate in the key
for( Comparable comp : entry.getKey() ) {
// Output the Covariate's value
if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG
recalTableStream.print( comp + "," );
}
// Output the RecalDatum entry
recalTableStream.println( entry.getValue().outputToCSV() );
if( !NO_PRINT_HEADER ) {
recalTableStream.printf("# Counted Sites %d%n", countedSites);
recalTableStream.printf("# Counted Bases %d%n", countedBases);
recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
for( Covariate cov : requestedCovariates ) {
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
}
}
// For each entry in the data hashmap
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
// For each Covariate in the key
for( Comparable comp : entry.getKey() ) {
// Output the Covariate's value
if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG
recalTableStream.print( comp + "," );
}
// Output the RecalDatum entry
recalTableStream.println( entry.getValue().outputToCSV() );
}
}
}

View File

@ -55,7 +55,7 @@ public class CycleCovariate implements Covariate {
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
final byte[] quals, final byte[] bases) {
if( platform.equalsIgnoreCase( "SLX" ) ) {
int cycle = offset;
if( read.getReadNegativeStrandFlag() ) {
@ -64,7 +64,7 @@ public class CycleCovariate implements Covariate {
return cycle;
} else if( platform.equalsIgnoreCase( "454" ) ) {
int cycle = 0;
char prevBase = bases[0];
byte prevBase = bases[0];
for( int iii = 1; iii <= offset; iii++ ) {
if(bases[iii] != prevBase) { // this base doesn't match the previous one so it is a new cycle
cycle++;

View File

@ -0,0 +1,46 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 16, 2009
*/
public class Dinuc implements Comparable<Dinuc>{
private byte first;
private byte second;
public Dinuc() {
first = 0;
second = 0;
}
public Dinuc(final byte _first, final byte _second) {
first = _first;
second = _second;
}
public final void setValues(final byte _first, final byte _second) {
first = _first;
second = _second;
}
public int compareTo(final Dinuc that) {
if( this.first > that.first ) { return 1; }
else if( this.first < that.first ) { return -1; }
else { //this.first equals that.first
if( this.second > that.second ) { return 1; }
else if( this.second < that.second ) { return -1; }
else { return 0; }
}
}
public static int hashBytes(final byte byte1, final byte byte2) {
return byte1 << 16 + byte2 << 4;
}
public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file
byte[] byteArray = {first,second};
return new String(byteArray);
}
}

View File

@ -2,6 +2,10 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -39,28 +43,37 @@ import net.sf.samtools.SAMRecord;
public class DinucCovariate implements Covariate {
private String returnString;
HashMap<Integer, Dinuc> dinucHashMap;
public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' };
dinucHashMap = new HashMap<Integer, Dinuc>();
for(byte byte1 : BASES) {
for(byte byte2: BASES) {
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) );
}
}
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
final byte[] quals, final byte[] bases) {
char base = bases[offset];
char prevBase = bases[offset - 1];
// If it is a negative strand read then we need to reverse the direction for our previous base
byte base = bases[offset];
byte prevBase = bases[offset - 1];
// If this is a negative strand read then we need to reverse the direction for our previous base
if( read.getReadNegativeStrandFlag() ) {
prevBase = bases[offset + 1];
}
final char[] charArray = {prevBase, base};
returnString = String.valueOf(charArray);
return returnString;
}
//char[] charArray = {(char)prevBase, (char)base};
//return new String( charArray ); // This is an expensive call
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
//return String.format("%c%c", prevBase, base); // This return statement is too slow
}
public final Comparable getValue(final String str) {
return str;
//return str;
return dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) );
}
public final int estimatedNumberOfBins() {

View File

@ -41,7 +41,7 @@ public class MappingQualityCovariate implements Covariate {
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
final byte[] quals, final byte[] bases) {
return read.getMappingQuality();
}

View File

@ -49,7 +49,7 @@ public class MinimumNQSCovariate implements Covariate {
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
final byte[] quals, final byte[] bases) {
// Loop over the list of base quality scores in the window and find the minimum
int minQual = quals[offset];

View File

@ -1,8 +1,5 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
/*
@ -36,84 +33,18 @@ import java.util.*;
* Date: Oct 30, 2009
*
* A HashMap that maps a list of comparables to any object <T>.
* There is functionality for the mappings to be given back to you in sorted order.
*/
public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse
private ArrayList<ArrayList<Comparable>> keyLists;
public NHashMap() {
super();
keyLists = null;
}
public NHashMap( int initialCapacity, float loadingFactor ) {
super( initialCapacity, loadingFactor );
keyLists = null;
}
// This method is here only to help facilitate direct comparison of old and refactored recalibrator.
// The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to.
public T myPut(List<? extends Comparable> key, T value) {
if( keyLists == null ) {
keyLists = new ArrayList<ArrayList<Comparable>>();
for( Comparable comp : key ) {
keyLists.add( new ArrayList<Comparable>() );
}
}
ArrayList<Comparable> thisList;
for( int iii = 0; iii < key.size(); iii++ ) {
thisList = keyLists.get( iii );
if( thisList == null ) {
thisList = new ArrayList<Comparable>();
}
if( !thisList.contains( key.get( iii ) ) ) {
thisList.add( key.get(iii ) );
}
}
return super.put( key, value );
}
// This method is very ugly but is here only to help facilitate direct comparison of old and refactored recalibrator.
// The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to.
@SuppressWarnings(value = "unchecked")
public ArrayList<Pair<List<? extends Comparable>, T>> entrySetSorted4() {
ArrayList<Pair<List<? extends Comparable>, T>> theSet = new ArrayList<Pair<List<? extends Comparable>, T>>();
for( ArrayList<Comparable> list : keyLists ) {
Collections.sort(list);
}
if( keyLists.size() != 4 ) {
throw new StingException("Are you sure you want to be calling this ugly method? NHashMap.entrySetSorted4()");
}
ArrayList<Comparable> newKey = null;
for( Comparable c0 : keyLists.get(0) ) {
for( Comparable c1 : keyLists.get(1) ) {
for( Comparable c2 : keyLists.get(2) ) {
for( Comparable c3 : keyLists.get(3) ) {
newKey = new ArrayList<Comparable>();
newKey.add(c0);
newKey.add(c1);
newKey.add(c2);
newKey.add(c3);
T value = this.get( newKey );
if( value!= null ) {
theSet.add(new Pair<List<? extends Comparable>,T>( newKey, value ) );
}
}
}
}
}
return theSet;
}
public static <T> List<T> makeList(T... args) {

View File

@ -52,7 +52,7 @@ public class PrimerRoundCovariate implements Covariate {
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
final byte[] quals, final byte[] bases) {
if( platform.equalsIgnoreCase( "SLX" ) ) {
return 1; // nothing to do here because it is always the same
} else if( platform.equalsIgnoreCase( "454" ) ) {

View File

@ -41,7 +41,7 @@ public class QualityScoreCovariate implements Covariate {
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
final byte[] quals, final byte[] bases) {
return (int)quals[offset];
}

View File

@ -41,7 +41,7 @@ public class ReadGroupCovariate implements Covariate{
}
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
final byte[] quals, final char[] bases, final char refBase) {
final byte[] quals, final byte[] bases) {
return readGroup;
}

View File

@ -46,7 +46,8 @@ public class RecalDataManager {
private boolean collapsedTablesCreated;
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag in a BAM file that holds the original quality scores
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
RecalDataManager() {
data = new NHashMap<RecalDatum>();
@ -54,8 +55,8 @@ public class RecalDataManager {
}
RecalDataManager( int estimatedCapacity ) {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.95f ); // second arg is the 'loading factor',
// a number to monkey around with when optimizing performace of the HashMap
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
// a number to monkey around with when optimizing performance of the HashMap
collapsedTablesCreated = false;
}

View File

@ -4,6 +4,8 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
@ -54,19 +56,22 @@ import java.io.FileNotFoundException;
*/
@WalkerName("TableRecalibrationRefactored")
@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the
// GATK to not actually spend time giving me the refBase array since I don't use it
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true)
public String RECAL_FILE;
@Argument(fullName="outputBam", shortName="outputBam", doc="output BAM file", required=false)
public SAMFileWriter OUTPUT_BAM = null;
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
public int PRESERVE_QSCORES_LESS_THAN = 5;
private String RECAL_FILE = null;
@Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false)
private SAMFileWriter OUTPUT_BAM = null;
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
private int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false)
private String PLATFORM = "SLX";
@Argument(fullName = "windowSizeNQS", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
public int SMOOTHING = 1;
@ -77,7 +82,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// ERROR
//}
@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
public String MODE_STRING = "SEQUENTIAL";
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
@ -86,6 +91,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
//---------------------------------------------------------------------------------------------------------------
//
@ -105,7 +112,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
int lineNumber = 0;
boolean foundAllCovariates = false;
//int estimatedCapacity = 1;
int estimatedCapacity = 1; // capacity is multiplicative so this starts at one
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList<Covariate>();
@ -136,7 +143,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); }
else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
//estimatedCapacity *= covariate.estimatedNumberOfBins();
estimatedCapacity *= covariate.estimatedNumberOfBins();
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
@ -157,8 +164,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
foundAllCovariates = true;
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
//dataManager = new RecalDataManager( estimatedCapacity );
dataManager = new RecalDataManager();
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
dataManager = new RecalDataManager( estimatedCapacity );
}
addCSVData(line); // parse the line and add the data to the HashMap
@ -211,9 +218,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/
public SAMRecord map( char[] refBases, SAMRecord read ) {
if( refBases == null ) {
return read; // early return here, unmapped reads should be left alone
}
// WARNING: refBases is always null because this walker doesn't have @REQUIRES({DataSource.REFERENCE_BASES})
// This is done in order to speed up the code
// if( read.getMappingQuality() <= 0 ) {
// return read; // early return here, unmapped reads and mapping quality zero reads should be left alone
// }
byte[] originalQuals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
@ -229,14 +239,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// These calls are expensive so only do them once for each read
String readGroup = read.getReadGroup().getReadGroupId();
char[] bases = read.getReadString().toCharArray();
if( refBases.length != bases.length ) {
return read; // something is wrong with the mapping of the read so leave it alone
}
String myRefBases = new String(refBases);
byte[] bases = read.getReadBases();
if( read.getReadNegativeStrandFlag() ) {
bases = BaseUtils.simpleComplement( read.getReadString() ).toCharArray();
myRefBases = BaseUtils.simpleComplement( myRefBases );
bases = BaseUtils.simpleComplement( bases );
}
@ -244,7 +250,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
for( int iii = 1; iii < read.getReadLength() - 1; iii++ ) { // skip first and last bases because there is no dinuc
List<Comparable> key = new ArrayList<Comparable>();
for( Covariate covariate : requestedCovariates ) {
key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases, myRefBases.charAt(iii) ) ); // offset is zero based so passing iii is correct here // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator
key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases ) ); // offset is zero based so passing iii is correct here
}
if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) {
@ -265,18 +271,24 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
if( PLATFORM.equalsIgnoreCase("SOLID") ) {
byte[] colorSpaceQuals = (byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
}
read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
}
return read;
}
/**
* Implements a serial recalibration of the reads using the combinational table. First,
* we perform a positional recalibration, and then a subsequent dinuc correction.
* Implements a serial recalibration of the reads using the combinational table.
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
*
* Given the full recalibration table, we perform the following preprocessing steps:
*
@ -293,7 +305,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
byte qualFromRead = Byte.parseByte(key.get(1).toString());
ArrayList<Comparable> newKey;
// The global quality shift (over the read group only)
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // read group
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey );
@ -304,7 +317,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
globalDeltaQ = globalDeltaQDatum.empiricalQualDouble( SMOOTHING ) - aggregrateQreported;
}
// The shift in quality between reported and empirical
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // read group
newKey.add( key.get(1) ); // quality score
@ -314,7 +327,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
deltaQReported = deltaQReportedDatum.empiricalQualDouble( SMOOTHING ) - qualFromRead - globalDeltaQ;
}
// The shift in quality due to each covariate by itself in turn
double deltaQCovariates = 0.0;
RecalDatum deltaQCovariateDatum;
for( int iii = 2; iii < key.size(); iii++ ) {
@ -355,7 +368,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores
*/
private void preserveQScores( byte[] originalQuals, byte[] recalQuals ) {
private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) {
for( int iii = 0; iii < recalQuals.length; iii++ ) {
if ( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
recalQuals[iii] = originalQuals[iii];
@ -363,6 +376,20 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
}
/**
 * Restores the original base quality wherever the color space quality indicates a bad
 * color call. SOLID pipelines insert the reference base into the read at positions whose
 * color space quality is zero, so such bases should keep their original quality scores.
 * @param originalQuals The original base quality scores for this read
 * @param recalQuals The newly recalibrated quality scores; overwritten in place
 * @param colorSpaceQuals The color space quality scores for this read
 */
private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) {
    for( int index = 0; index < recalQuals.length; index++ ) {
        final boolean badColorCall = colorSpaceQuals[index] <= 0; //BUGBUG: This isn't exactly correct yet
        if( badColorCall ) {
            recalQuals[index] = originalQuals[index];
        }
    }
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce