Encapsulated the sections of code that were shared by the two Recalibration walkers. This includes both the shared command line arguments and the section of code in the map methods which pull out data from the SAMRecord and stuff it into the ReadHashDatum. Command line arguments are now passed to the Covariates using a new initialize method that all Covariates must implement. Updated the dbsnp sanity check warning message to be less cryptic.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2170 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
75b61a3663
commit
d1298dda13
|
|
@ -36,7 +36,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
*/
|
||||
|
||||
public interface Covariate {
|
||||
public Comparable getValue(ReadHashDatum readDatum, int offset); // Used to pick out the covariate's value from attributes of the read
|
||||
public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public Comparable getValue( ReadHashDatum readDatum, int offset ); // Used to pick out the covariate's value from attributes of the read
|
||||
public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public int estimatedNumberOfBins(); // Used to estimate the amount space required for the full data HashMap
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
|
|
@ -69,6 +70,11 @@ import edu.mit.broad.picard.illumina.parser.IlluminaUtil;
|
|||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
||||
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||
|
||||
/////////////////////////////
|
||||
// Shared Arguments
|
||||
/////////////////////////////
|
||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
|
|
@ -76,31 +82,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private Boolean LIST_ONLY = false;
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
private boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 5;
|
||||
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
|
||||
private String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName="process_nth_locus", shortName="pN", required=false, doc="Only process every Nth covered locus we see.")
|
||||
private int PROCESS_EVERY_NTH_LOCUS = 1;
|
||||
@Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.")
|
||||
private String DEFAULT_READ_GROUP = ReadGroupCovariate.defaultReadGroup;
|
||||
@Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
private String DEFAULT_PLATFORM = "Illumina";
|
||||
@Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
|
||||
private String FORCE_READ_GROUP = null;
|
||||
@Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
private String FORCE_PLATFORM = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. FOR DEBUGGING PURPOSES ONLY.")
|
||||
private boolean NO_PRINT_HEADER = false;
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="!!!Depricated!!! Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
|
||||
private boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
// BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't.
|
||||
@Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The outputted table recalibration file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
private boolean SORTED_OUTPUT = false;
|
||||
|
||||
|
|
@ -115,13 +104,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private final String versionString = "v2.0.6"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.0.7"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
private int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
|
||||
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
|
||||
private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -136,8 +124,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
public void initialize() {
|
||||
|
||||
logger.info( "CovariateCounterWalker version: " + versionString );
|
||||
if( FORCE_READ_GROUP != null ) { DEFAULT_READ_GROUP = FORCE_READ_GROUP; }
|
||||
if( FORCE_PLATFORM != null ) { DEFAULT_PLATFORM = FORCE_PLATFORM; }
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface( Covariate.class );
|
||||
|
|
@ -169,11 +157,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
// BUGBUG: This is a mess because there are a lot of cases (validate, all, none, and supplied covList). Clean up needed.
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new CycleCovariate() ); // Unfortunately order is different here in order to match the old recalibrator exactly
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
estimatedCapacity = 60 * 100 * 40 * 16;
|
||||
} else if( COVARIATES != null ) {
|
||||
if(COVARIATES[0].equalsIgnoreCase( "ALL" )) { // The user wants ALL covariates to be used
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // First add the required covariates then add the rest by looping over all implementing classes that were found
|
||||
|
|
@ -181,10 +170,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
for( Class<?> covClass : classes ) {
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
// Some covariates need parameters (user supplied command line arguments) passed to them
|
||||
if( covariate instanceof CycleCovariate && DEFAULT_PLATFORM != null ) { covariate = new CycleCovariate( DEFAULT_PLATFORM ); }
|
||||
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
|
||||
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // These were already added so don't add them again
|
||||
requestedCovariates.add( covariate );
|
||||
}
|
||||
|
|
@ -211,9 +198,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
// Now that we've found a matching class, try to instantiate it
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
// Some covariates need parameters (user supplied command line arguments) passed to them
|
||||
if( covariate instanceof CycleCovariate && DEFAULT_PLATFORM != null ) { covariate = new CycleCovariate( DEFAULT_PLATFORM ); }
|
||||
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
|
||||
requestedCovariates.add( covariate );
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
|
|
@ -232,14 +216,19 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." );
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
estimatedCapacity = 300 * 40;
|
||||
estimatedCapacity = 60 * 40;
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
logger.info( requestedCovariates );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
||||
}
|
||||
|
||||
if(estimatedCapacity > 300 * 40 * 200 * 16 || estimatedCapacity < 0) // could be negative if overflowed
|
||||
{ estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
|
||||
// Don't want to crash with out of heap space exception
|
||||
if(estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0) { // Could be negative if overflowed
|
||||
estimatedCapacity = 300 * 40 * 200;
|
||||
}
|
||||
dataManager = new RecalDataManager( estimatedCapacity );
|
||||
readDatumHashMap = new IdentityHashMap<SAMRecord, ReadHashDatum>();
|
||||
}
|
||||
|
|
@ -283,17 +272,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
final List<Integer> offsets = context.getOffsets();
|
||||
SAMRecord read;
|
||||
int offset;
|
||||
String readGroupId;
|
||||
byte[] quals;
|
||||
byte[] bases;
|
||||
byte refBase;
|
||||
byte prevBase;
|
||||
String platform;
|
||||
byte[] colorSpaceQuals;
|
||||
ReadHashDatum readDatum;
|
||||
boolean isNegStrand;
|
||||
int mappingQuality;
|
||||
int length;
|
||||
|
||||
final int numReads = reads.size();
|
||||
// For each read at this locus
|
||||
|
|
@ -312,45 +294,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
|
||||
// This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
|
||||
// Lots of lines just pulling things out of the SAMRecord and stuffing into local variables, many edge cases to worry about
|
||||
// These lines should be exactly the same in TableRecalibrationWalker
|
||||
quals = read.getBaseQualities();
|
||||
// Check if we need to use the original quality scores instead
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
quals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'.
|
||||
isNegStrand = read.getReadNegativeStrandFlag();
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
if( readGroup == null ) {
|
||||
if( !warnUserNullReadGroup && FORCE_READ_GROUP == null ) {
|
||||
Utils.warnUser("The input .bam file contains reads with no read group. " +
|
||||
"Defaulting to read group ID = " + DEFAULT_READ_GROUP + " and platform = " + DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName() );
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
// There is no readGroup so defaulting to these values
|
||||
readGroupId = DEFAULT_READ_GROUP;
|
||||
platform = DEFAULT_PLATFORM;
|
||||
} else {
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
platform = readGroup.getPlatform();
|
||||
}
|
||||
mappingQuality = read.getMappingQuality();
|
||||
length = bases.length;
|
||||
if( FORCE_READ_GROUP != null ) { // Collapse all the read groups into a single common String provided by the user
|
||||
readGroupId = FORCE_READ_GROUP;
|
||||
}
|
||||
if( FORCE_PLATFORM != null ) {
|
||||
platform = FORCE_PLATFORM;
|
||||
}
|
||||
Integer tile = IlluminaUtil.getTileFromReadName(read.getReadName());
|
||||
readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length, tile );
|
||||
readDatum = ReadHashDatum.parseSAMRecord( read, RAC );
|
||||
readDatumHashMap.put( read, readDatum );
|
||||
sizeOfReadDatumHashMap++;
|
||||
}
|
||||
|
|
@ -384,7 +328,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
updateDataFromRead( readDatum, offset, refBase );
|
||||
}
|
||||
} else {
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
|
||||
countedBases++; // Replicating a small bug in the old recalibrator
|
||||
}
|
||||
}
|
||||
|
|
@ -406,7 +350,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
|
||||
// Do a dbSNP sanity check every so often
|
||||
if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||
if( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||
lociSinceLastDbsnpCheck = 0;
|
||||
validateDbsnpMismatchRate();
|
||||
}
|
||||
|
|
@ -424,13 +368,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int iii = 0; iii < reads.size(); iii++ ) {
|
||||
for(int iii = 0; iii < reads.size(); iii++ ) {
|
||||
char readChar = (char)(reads.get(iii).getReadBases()[offsets.get(iii)]);
|
||||
int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar);
|
||||
int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||
|
||||
if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) {
|
||||
if ( readCharBaseIndex != refCharBaseIndex ) {
|
||||
if( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) {
|
||||
if( readCharBaseIndex != refCharBaseIndex ) {
|
||||
counts.first++;
|
||||
}
|
||||
counts.second++;
|
||||
|
|
@ -442,17 +386,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
* Validate the dbSNP mismatch rates.
|
||||
*/
|
||||
private void validateDbsnpMismatchRate() {
|
||||
if ( novel_counts.second == 0 || dbSNP_counts.second == 0 ) {
|
||||
if( novel_counts.second == 0 || dbSNP_counts.second == 0 ) {
|
||||
return;
|
||||
}
|
||||
|
||||
double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second;
|
||||
double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second;
|
||||
//System.out.println(String.format("dbSNP rate = %.2f, novel rate = %.2f", fractionMM_dbsnp, fractionMM_novel));
|
||||
|
||||
if ( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
|
||||
Utils.warnUser("The variation rate at supplied known variant sites seems suspiciously low. Please double-check that the correct ROD is being used.");
|
||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't output the message every megabase of a large file
|
||||
|
||||
if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
|
||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
|
||||
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
|
||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -480,7 +424,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
RecalDatum datum = dataManager.data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
||||
if( VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) {
|
||||
dataManager.data.sortedPut( key, datum );
|
||||
} else {
|
||||
dataManager.data.put( key, datum );
|
||||
|
|
@ -508,7 +452,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
*/
|
||||
public PrintStream reduceInit() {
|
||||
try {
|
||||
return new PrintStream( RECAL_FILE );
|
||||
return new PrintStream( RAC.RECAL_FILE );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException( "Couldn't open output file: ", e );
|
||||
}
|
||||
|
|
@ -543,7 +487,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private void outputToCSV( final PrintStream recalTableStream ) {
|
||||
|
||||
//BUGBUG: This method is a mess. It will be cleaned up when I get rid of the validation and no_header debug options.
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
|
||||
// Output the old header as well as output the data in sorted order
|
||||
recalTableStream.printf("# collapsed_pos false%n");
|
||||
recalTableStream.printf("# collapsed_dinuc false%n");
|
||||
|
|
@ -574,7 +518,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
}
|
||||
|
||||
if( SORTED_OUTPUT )
|
||||
if( SORTED_OUTPUT && requestedCovariates.size() == 4 )
|
||||
{
|
||||
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here
|
||||
for( Comparable comp : entry.first ) {
|
||||
|
|
|
|||
|
|
@ -46,28 +46,35 @@ public class CycleCovariate implements Covariate {
|
|||
private static boolean warnedUserBadPlatform = false;
|
||||
private static String defaultPlatform;
|
||||
|
||||
public CycleCovariate() { // Empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
defaultPlatform = null;
|
||||
}
|
||||
|
||||
public CycleCovariate( final String _defaultPlatform ) {
|
||||
if( _defaultPlatform.equalsIgnoreCase( "ILLUMINA" ) || _defaultPlatform.contains( "454" ) || _defaultPlatform.equalsIgnoreCase( "SOLID" ) ) {
|
||||
defaultPlatform = _defaultPlatform;
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) || RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) ) {
|
||||
defaultPlatform = RAC.DEFAULT_PLATFORM;
|
||||
} else {
|
||||
throw new StingException( "The requested default platform (" + _defaultPlatform +") is not a recognized platform. Implemented options are illumina, 454, and solid");
|
||||
throw new StingException( "The requested default platform (" + RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid");
|
||||
}
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||
|
||||
//-----------------------------
|
||||
// ILLUMINA
|
||||
//-----------------------------
|
||||
|
||||
if( readDatum.platform.equalsIgnoreCase( "ILLUMINA" ) ) {
|
||||
int cycle = offset;
|
||||
if( readDatum.isNegStrand ) {
|
||||
cycle = readDatum.bases.length - (offset + 1);
|
||||
}
|
||||
return cycle;
|
||||
} else if( readDatum.platform.contains( "454" ) ) { // Some bams have "LS454" and others have just "454"
|
||||
}
|
||||
|
||||
//-----------------------------
|
||||
// 454
|
||||
//-----------------------------
|
||||
|
||||
else if( readDatum.platform.contains( "454" ) ) { // Some bams have "LS454" and others have just "454"
|
||||
int cycle = 0;
|
||||
//BUGBUG: should reverse directions on negative strand reads!
|
||||
byte prevBase = readDatum.bases[0];
|
||||
|
|
@ -78,11 +85,23 @@ public class CycleCovariate implements Covariate {
|
|||
}
|
||||
}
|
||||
return cycle;
|
||||
} else if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) {
|
||||
}
|
||||
|
||||
//-----------------------------
|
||||
// SOLID
|
||||
//-----------------------------
|
||||
|
||||
else if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) {
|
||||
// The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||
//BUGBUG: should reverse directions on negative strand reads!
|
||||
return offset / 5; // integer division
|
||||
} else { // Platform is unrecognized so revert to the default platform but warn the user first
|
||||
}
|
||||
|
||||
//-----------------------------
|
||||
// UNRECOGNIZED PLATFORM
|
||||
//-----------------------------
|
||||
|
||||
else { // Platform is unrecognized so revert to the default platform but warn the user first
|
||||
if( !warnedUserBadPlatform ) {
|
||||
if( defaultPlatform != null) { // the user set a default platform
|
||||
Utils.warnUser( "Platform string (" + readDatum.platform + ") unrecognized in CycleCovariate. " +
|
||||
|
|
@ -110,8 +129,4 @@ public class CycleCovariate implements Covariate {
|
|||
public final int estimatedNumberOfBins() {
|
||||
return 100;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Machine Cycle";
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,30 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
|
|
|
|||
|
|
@ -43,8 +43,8 @@ public class DinucCovariate implements Covariate {
|
|||
|
||||
HashMap<Integer, Dinuc> dinucHashMap;
|
||||
|
||||
public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' };
|
||||
dinucHashMap = new HashMap<Integer, Dinuc>();
|
||||
for(byte byte1 : BASES) {
|
||||
|
|
@ -83,8 +83,4 @@ public class DinucCovariate implements Covariate {
|
|||
public final int estimatedNumberOfBins() {
|
||||
return 16;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Dinucleotide";
|
||||
}
|
||||
}
|
||||
|
|
@ -35,7 +35,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
public class MappingQualityCovariate implements Covariate {
|
||||
|
||||
public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
|
|
@ -53,8 +54,4 @@ public class MappingQualityCovariate implements Covariate {
|
|||
public final int estimatedNumberOfBins() {
|
||||
return 100;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Mapping Quality Score";
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,14 +36,11 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
public class MinimumNQSCovariate implements Covariate {
|
||||
|
||||
private int windowReach; // how far in each direction from the current base to look
|
||||
private int windowReach; // How far in each direction from the current base to look
|
||||
|
||||
public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
windowReach = 2; // window size = 5 was the best covariate according to Chris's analysis
|
||||
}
|
||||
|
||||
public MinimumNQSCovariate(final int windowSize) {
|
||||
windowReach = windowSize / 2; // integer division
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
windowReach = RAC.WINDOW_SIZE / 2; // integer division
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
|
|
@ -70,8 +67,4 @@ public class MinimumNQSCovariate implements Covariate {
|
|||
public final int estimatedNumberOfBins() {
|
||||
return 40;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Minimum Neighborhood Quality Score (window size = " + (windowReach * 2 + 1) + ")";
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,7 +36,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
public class PositionCovariate implements Covariate {
|
||||
|
||||
public PositionCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
|
|
@ -57,8 +58,4 @@ public class PositionCovariate implements Covariate {
|
|||
public final int estimatedNumberOfBins() {
|
||||
return 100;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Position in Read";
|
||||
}
|
||||
}
|
||||
|
|
@ -37,7 +37,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
public class PrimerRoundCovariate implements Covariate {
|
||||
|
||||
public PrimerRoundCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
|
|
@ -59,8 +60,4 @@ public class PrimerRoundCovariate implements Covariate {
|
|||
public final int estimatedNumberOfBins() {
|
||||
return 5;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Primer Round";
|
||||
}
|
||||
}
|
||||
|
|
@ -35,7 +35,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
public class QualityScoreCovariate implements Covariate {
|
||||
|
||||
public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
|
|
@ -53,8 +54,4 @@ public class QualityScoreCovariate implements Covariate {
|
|||
public final int estimatedNumberOfBins() {
|
||||
return 40;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Reported Quality Score";
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,7 +37,8 @@ public class ReadGroupCovariate implements Covariate{
|
|||
|
||||
public static final String defaultReadGroup = "DefaultReadGroup";
|
||||
|
||||
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
|
|
@ -52,11 +53,7 @@ public class ReadGroupCovariate implements Covariate{
|
|||
|
||||
// Used to estimate the amount space required for the full data HashMap
|
||||
public final int estimatedNumberOfBins() {
|
||||
return 300;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Read Group";
|
||||
return 60;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,42 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import edu.mit.broad.picard.illumina.parser.IlluminaUtil;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 14, 2009
|
||||
*/
|
||||
|
||||
public class ReadHashDatum {
|
||||
public String readGroup;
|
||||
public String platform;
|
||||
|
|
@ -14,6 +46,7 @@ public class ReadHashDatum {
|
|||
public int mappingQuality;
|
||||
public int length;
|
||||
public Integer tile;
|
||||
private static boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
|
||||
|
||||
public ReadHashDatum(String _readGroup, String _platform, byte[] _quals, byte[] _bases, boolean _isNegStrand,
|
||||
int _mappingQuality, int _length, Integer _tile) {
|
||||
|
|
@ -37,4 +70,64 @@ public class ReadHashDatum {
|
|||
this.length = that.length;
|
||||
this.tile = that.tile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pulls the important bits out of a SAMRecord and stuffs them into a leaner Object to be used by the Covariates and cached by the Recalibration walkers
|
||||
* @param read The SAMRecord whose data we want to get at
|
||||
* @param RAC The collection of arguments that are shared between CovariateCounterWalker and TableRecalibrationWalker
|
||||
* @return A new ReadHashDatum holding all the important parts of the read
|
||||
*/
|
||||
public static ReadHashDatum parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) {
|
||||
|
||||
// Lots of lines just pulling things out of the SAMRecord and stuffing into local variables, many edge cases to worry about
|
||||
byte[] quals = read.getBaseQualities();
|
||||
|
||||
// Check if we need to use the original quality scores instead
|
||||
if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
quals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
|
||||
final byte[] bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'.
|
||||
final boolean isNegStrand = read.getReadNegativeStrandFlag();
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
String readGroupId;
|
||||
String platform;
|
||||
|
||||
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
|
||||
if( readGroup == null ) {
|
||||
if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) {
|
||||
Utils.warnUser("The input .bam file contains reads with no read group. " +
|
||||
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName() );
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
// There is no readGroup so defaulting to these values
|
||||
readGroupId = RAC.DEFAULT_READ_GROUP;
|
||||
platform = RAC.DEFAULT_PLATFORM;
|
||||
} else {
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
platform = readGroup.getPlatform();
|
||||
}
|
||||
|
||||
final int mappingQuality = read.getMappingQuality();
|
||||
final int length = bases.length;
|
||||
|
||||
if( RAC.FORCE_READ_GROUP != null ) { // Collapse all the read groups into a single common String provided by the user
|
||||
readGroupId = RAC.FORCE_READ_GROUP;
|
||||
}
|
||||
|
||||
if( RAC.FORCE_PLATFORM != null ) {
|
||||
platform = RAC.FORCE_PLATFORM;
|
||||
}
|
||||
|
||||
Integer tile = IlluminaUtil.getTileFromReadName(read.getReadName());
|
||||
|
||||
return new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length, tile );
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,68 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 27, 2009
|
||||
*
|
||||
* A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
|
||||
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
|
||||
*/
|
||||
|
||||
public class RecalibrationArgumentCollection {
|
||||
|
||||
//////////////////////////////////
|
||||
// Shared Command Line Arguments
|
||||
//////////////////////////////////
|
||||
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
|
||||
public String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
public boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
|
||||
public int WINDOW_SIZE = 5;
|
||||
@Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.")
|
||||
public String DEFAULT_READ_GROUP = ReadGroupCovariate.defaultReadGroup;
|
||||
@Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String DEFAULT_PLATFORM = "Illumina";
|
||||
@Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
|
||||
public String FORCE_READ_GROUP = null;
|
||||
@Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
|
||||
//////////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
//////////////////////////////////
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="!!!Depricated!!! Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
|
||||
public boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
// BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't.
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
|
|||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -62,36 +63,25 @@ import edu.mit.broad.picard.illumina.parser.IlluminaUtil;
|
|||
// GATK to not actually spend time giving me the refBase array since I don't use it
|
||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
|
||||
/////////////////////////////
|
||||
// Shared Arguments
|
||||
/////////////////////////////
|
||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true)
|
||||
private String RECAL_FILE = null;
|
||||
@Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=true)
|
||||
private String OUTPUT_BAM_FILE = null;
|
||||
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
|
||||
doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
private boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 5;
|
||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
private int SMOOTHING = 1;
|
||||
@Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.")
|
||||
private String DEFAULT_READ_GROUP = ReadGroupCovariate.defaultReadGroup;
|
||||
@Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
private String DEFAULT_PLATFORM = "Illumina";
|
||||
@Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String.")
|
||||
private String FORCE_READ_GROUP = null;
|
||||
@Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
private String FORCE_PLATFORM = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
|
||||
private boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
private boolean NO_PG_TAG = false;
|
||||
|
||||
|
|
@ -102,11 +92,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
|
||||
private ArrayList<Comparable> fullCovariateKey; // The list that will be used over and over again to hold the full set of covariate values
|
||||
private ArrayList<Comparable> collapsedTableKey; // The key that will be used over and over again to query the collapsed tables
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||
private final String versionString = "v2.0.4"; // Major version, minor version, and build number
|
||||
private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
|
||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||
private static final String versionString = "v2.0.5"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -123,8 +112,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
public void initialize() {
|
||||
|
||||
logger.info( "TableRecalibrationWalker version: " + versionString );
|
||||
if( FORCE_READ_GROUP != null ) { DEFAULT_READ_GROUP = FORCE_READ_GROUP; }
|
||||
if( FORCE_PLATFORM != null ) { DEFAULT_PLATFORM = FORCE_PLATFORM; }
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
|
||||
// Get a list of all available covariates
|
||||
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
|
||||
|
|
@ -154,14 +143,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
logger.info( "Reading in the data from input file..." );
|
||||
|
||||
try {
|
||||
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
|
||||
for ( String line : new xReadLines(new File( RAC.RECAL_FILE )) ) {
|
||||
lineNumber++;
|
||||
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE );
|
||||
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RAC.RECAL_FILE );
|
||||
} else { // Found another covariate in input file
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
|
|
@ -170,9 +159,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
// Some covariates need parameters (user supplied command line arguments) passed to them
|
||||
if( covariate instanceof CycleCovariate && DEFAULT_PLATFORM != null ) { covariate = new CycleCovariate( DEFAULT_PLATFORM ); }
|
||||
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
|
||||
requestedCovariates.add( covariate );
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
|
||||
|
|
@ -192,15 +178,23 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
} else { // Found some data
|
||||
if( !foundAllCovariates ) {
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new CycleCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
}
|
||||
foundAllCovariates = true;
|
||||
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
|
||||
// Don't want to crash with out of heap space exception
|
||||
if(estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0) { // Could be negative if overflowed
|
||||
estimatedCapacity = 300 * 40 * 200;
|
||||
}
|
||||
final boolean createCollapsedTables = true;
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( RAC );
|
||||
}
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
|
||||
|
||||
|
|
@ -210,14 +204,16 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
Utils.scareUser("Can not find input file: " + RECAL_FILE);
|
||||
Utils.scareUser("Can not find input file: " + RAC.RECAL_FILE);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
logger.info( "...done!" );
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
logger.info( requestedCovariates );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
}
|
||||
|
||||
// Create the tables of empirical quality scores that will be used in the sequential calculation
|
||||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||
|
|
@ -251,7 +247,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( iii == 1 ) { // Order is different in the old recalibrator unfortunately
|
||||
key.add( cov.getValue( vals[2] ) );
|
||||
} else if ( iii == 2 ) {
|
||||
|
|
@ -289,7 +285,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
byte[] originalQuals = read.getBaseQualities();
|
||||
// Check if we need to use the original quality scores instead
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
originalQuals = QualityUtils.fastqToPhred((String)obj);
|
||||
|
|
@ -299,44 +295,16 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
// These calls are all expensive so only do them once for each read
|
||||
// Pulling out all of the useful things from the read
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
String readGroupId;
|
||||
String platform;
|
||||
if( readGroup == null ) {
|
||||
if( !warnUserNullReadGroup && FORCE_READ_GROUP == null) {
|
||||
Utils.warnUser("The input .bam file contains reads with no read group. " +
|
||||
"Defaulting to read group ID = " + DEFAULT_READ_GROUP + " and platform = " + DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName() );
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
// There is no readGroup so defaulting to these values
|
||||
readGroupId = DEFAULT_READ_GROUP;
|
||||
platform = DEFAULT_PLATFORM;
|
||||
} else {
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
platform = readGroup.getPlatform();
|
||||
}
|
||||
final boolean isNegStrand = read.getReadNegativeStrandFlag();
|
||||
byte[] bases = read.getReadBases();
|
||||
// Fill out the readDatum with the useful information from the read
|
||||
ReadHashDatum readDatum = ReadHashDatum.parseSAMRecord( read, RAC );
|
||||
|
||||
int startPos = 1;
|
||||
int stopPos = bases.length;
|
||||
int stopPos = readDatum.length;
|
||||
|
||||
if( isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed
|
||||
if( readDatum.isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed
|
||||
startPos = 0;
|
||||
stopPos = bases.length - 1;
|
||||
stopPos = readDatum.length - 1;
|
||||
}
|
||||
if( FORCE_READ_GROUP != null ) { // Collapse all the read groups into a single common String provided by the user
|
||||
readGroupId = FORCE_READ_GROUP;
|
||||
}
|
||||
if( FORCE_PLATFORM != null ) {
|
||||
platform = FORCE_PLATFORM;
|
||||
}
|
||||
|
||||
Integer tile = IlluminaUtil.getTileFromReadName(read.getReadName());
|
||||
ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand,
|
||||
read.getMappingQuality(), bases.length, tile );
|
||||
|
||||
// For each base in the read
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) { // Skip first or last base because there is no dinuc depending on the direction of the read
|
||||
|
|
@ -353,7 +321,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
||||
|
||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
|
||||
if( platform.equalsIgnoreCase("SOLID") ) {
|
||||
if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
|
||||
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
|
||||
}
|
||||
|
|
@ -415,7 +383,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( collapsedTableKey );
|
||||
if( deltaQCovariateEmpirical != null ) {
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: Only here to validate against the old recalibrator
|
||||
if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,6 +29,12 @@ import org.broadinstitute.sting.utils.StingException;
|
|||
* @author alecw@broadinstitute.org
|
||||
*/
|
||||
public class TileCovariate implements Covariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public Comparable getValue(final ReadHashDatum readDatum, final int offset) {
|
||||
if (readDatum.tile == null) {
|
||||
throw new StingException("Tile number not defined for read");
|
||||
|
|
@ -36,10 +42,12 @@ public class TileCovariate implements Covariate {
|
|||
return readDatum.tile;
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public Comparable getValue(final String str) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
// Used to estimate the amount space required for the full data HashMap
|
||||
public int estimatedNumberOfBins() {
|
||||
return 120;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue