Added a hierarchy to the covariate classes: Required, Standard, and Experimental. Required covariates (read group and reported quality) are added for the user whether or not they are specified in the -cov list. There is now a -standard option in CountCovariates which adds all of the standard covariates, so the user doesn't have to type them all out or even know which ones are standard. Logger output reports which covariates are being used. The list of covariates used is also added to the PG tag in the BAM file produced by TableRecalibration.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2338 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-12-12 16:34:05 +00:00
parent 6955b5bf53
commit 014013630f
12 changed files with 64 additions and 61 deletions

View File

@ -43,3 +43,12 @@ public interface Covariate {
public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public int estimatedNumberOfBins(); // Used to estimate the amount space required for the full data HashMap
}
/**
 * Marker interface for the "required" tier of covariates. It declares no members of its own
 * beyond those inherited from {@link Covariate}.
 *
 * Required covariates are always used for recalibration: CovariateCounterWalker adds them
 * first, whether or not they are named in the -cov list. In this change the implementors are
 * ReadGroupCovariate and QualityScoreCovariate, and the walker instantiates them explicitly in
 * that order, so adding a new implementor also requires updating that instantiation list.
 */
interface RequiredCovariate extends Covariate {
}
/**
 * Marker interface for the "standard" tier of covariates. It declares no members of its own
 * beyond those inherited from {@link Covariate}.
 *
 * When the user passes -standard (--standard_covs) to CovariateCounterWalker, every class
 * implementing this interface is instantiated via its no-arg constructor and added after the
 * required covariates. In this change the implementors are CycleCovariate and DinucCovariate.
 */
interface StandardCovariate extends Covariate {
}
/**
 * Marker interface for the "experimental" tier of covariates. It declares no members of its
 * own beyond those inherited from {@link Covariate}.
 *
 * Experimental covariates are neither added automatically nor pulled in by -standard; in
 * CovariateCounterWalker they are used only when named explicitly with a -cov argument.
 */
interface ExperimentalCovariate extends Covariate {
}

View File

@ -78,9 +78,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Command Line Arguments
/////////////////////////////
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
private Boolean LIST_ONLY = false;
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
private boolean LIST_ONLY = false;
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
private String[] COVARIATES = null;
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
private boolean USE_STANDARD_COVARIATES = false;
@Argument(fullName="process_nth_locus", shortName="pN", required=false, doc="Only process every Nth covered locus we see.")
private int PROCESS_EVERY_NTH_LOCUS = 1;
@ -101,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. BUGBUG: I don't understand what is going on in this case
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private static final String versionString = "v2.1.0"; // Major version, minor version, and build number
private static final String versionString = "v2.1.1"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -129,12 +131,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
// Get a list of all available covariates
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface( Covariate.class );
final List<Class<? extends Covariate>> covariateClasses = PackageUtils.getClassesImplementingInterface( Covariate.class );
final List<Class<? extends RequiredCovariate>> requiredClasses = PackageUtils.getClassesImplementingInterface( RequiredCovariate.class );
final List<Class<? extends StandardCovariate>> standardClasses = PackageUtils.getClassesImplementingInterface( StandardCovariate.class );
// Print and exit if that's what was requested
if ( LIST_ONLY ) {
out.println( "Available covariates:" );
for( Class<?> covClass : classes ) {
for( Class<?> covClass : covariateClasses ) {
out.println( covClass.getSimpleName() );
}
out.println();
@ -155,44 +159,38 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Initialize the requested covariates by parsing the -cov argument
// BUGBUG: This is a mess because there are a lot of cases (validate, all, none, and supplied covList). Clean up needed.
requestedCovariates = new ArrayList<Covariate>();
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
// First add the required covariates
if( requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
requestedCovariates.add( new QualityScoreCovariate() );
} else {
throw new StingException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
}
// Next add the standard covariates if -standard was specified by the user
if( USE_STANDARD_COVARIATES ) {
for( Class<?> covClass : standardClasses ) {
try {
Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
}
}
// Finally parse the -cov arguments that were provided, skipping over the ones already specified
if( COVARIATES != null ) {
if(COVARIATES[0].equalsIgnoreCase( "ALL" )) { // The user wants ALL covariates to be used
requestedCovariates.add( new ReadGroupCovariate() ); // First add the required covariates then add the rest by looping over all implementing classes that were found
requestedCovariates.add( new QualityScoreCovariate() );
for( Class<?> covClass : classes ) {
try {
Covariate covariate = (Covariate)covClass.newInstance();
estimatedCapacity *= covariate.estimatedNumberOfBins();
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // These were already added so don't add them again
requestedCovariates.add( covariate );
}
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
}
} else { // The user has specified a list of several covariates
int covNumber = 1;
for( String requestedCovariateString : COVARIATES ) {
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
foundClass = true;
// Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list
if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) ||
(covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) {
throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" );
}
covNumber++;
for( String requestedCovariateString : COVARIATES ) {
boolean foundClass = false;
for( Class<?> covClass : covariateClasses ) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
foundClass = true;
if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) {
try {
// Now that we've found a matching class, try to instantiate it
Covariate covariate = (Covariate)covClass.newInstance();
estimatedCapacity *= covariate.estimatedNumberOfBins();
requestedCovariates.add( covariate );
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
@ -201,17 +199,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
}
}
}
if( !foundClass ) {
throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
}
if( !foundClass ) {
throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
}
}
} else { // No covariates were specified by the user so add the default, required ones
Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." );
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new QualityScoreCovariate() );
estimatedCapacity = 60 * 40;
}
logger.info( "The covariates being used here: " );
@ -221,10 +214,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
// Don't want to crash with out of heap space exception
if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed
estimatedCapacity = 300 * 40 * 200;
}
dataManager = new RecalDataManager( estimatedCapacity );
//if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed
// estimatedCapacity = 300 * 40 * 200;
//}
dataManager = new RecalDataManager();
}

View File

@ -42,7 +42,7 @@ import net.sf.samtools.SAMRecord;
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
*/
public class CycleCovariate implements Covariate {
public class CycleCovariate implements StandardCovariate {
private static boolean warnedUserBadPlatform = false;
private static String defaultPlatform;

View File

@ -40,7 +40,7 @@ import net.sf.samtools.SAMRecord;
* This assumption is made to speed up the code.
*/
public class DinucCovariate implements Covariate {
public class DinucCovariate implements StandardCovariate {
HashMap<Integer, Dinuc> dinucHashMap;

View File

@ -38,7 +38,7 @@ import net.sf.samtools.SAMRecord;
* 00100123121123456789
*/
public class HomopolymerCovariate implements Covariate {
public class HomopolymerCovariate implements ExperimentalCovariate {
int numBack = 10;

View File

@ -35,7 +35,7 @@ import net.sf.samtools.SAMRecord;
* The Mapping Quality covariate.
*/
public class MappingQualityCovariate implements Covariate {
public class MappingQualityCovariate implements ExperimentalCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
public void initialize( final RecalibrationArgumentCollection RAC ) {

View File

@ -36,7 +36,7 @@ import net.sf.samtools.SAMRecord;
* This covariate is the minimum base quality score in the read in a small window around the current base.
*/
public class MinimumNQSCovariate implements Covariate {
public class MinimumNQSCovariate implements ExperimentalCovariate {
private int windowReach; // How far in each direction from the current base to look

View File

@ -36,7 +36,7 @@ import net.sf.samtools.SAMRecord;
* This is the Solexa definition of machine cycle and the covariate that was always being used in the original version of the recalibrator.
*/
public class PositionCovariate implements Covariate {
public class PositionCovariate implements ExperimentalCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
public void initialize( final RecalibrationArgumentCollection RAC ) {

View File

@ -37,7 +37,7 @@ import net.sf.samtools.SAMRecord;
* For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
*/
public class PrimerRoundCovariate implements Covariate {
public class PrimerRoundCovariate implements ExperimentalCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
public void initialize( final RecalibrationArgumentCollection RAC ) {

View File

@ -35,7 +35,7 @@ import net.sf.samtools.SAMRecord;
* The Reported Quality Score covariate.
*/
public class QualityScoreCovariate implements Covariate {
public class QualityScoreCovariate implements RequiredCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
public void initialize( final RecalibrationArgumentCollection RAC ) {

View File

@ -35,7 +35,7 @@ import net.sf.samtools.SAMRecord;
* The Read Group covariate.
*/
public class ReadGroupCovariate implements Covariate{
public class ReadGroupCovariate implements RequiredCovariate{
public static final String defaultReadGroup = "DefaultReadGroup";

View File

@ -30,7 +30,7 @@ import edu.mit.broad.picard.illumina.parser.IlluminaUtil;
/**
* @author alecw@broadinstitute.org
*/
public class TileCovariate implements Covariate {
public class TileCovariate implements ExperimentalCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
public void initialize( final RecalibrationArgumentCollection RAC ) {