From 014013630f70e32d1c80c50ff3016b005816b928 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Sat, 12 Dec 2009 16:34:05 +0000 Subject: [PATCH] Added hierarchy to the covariate classes: Required, Standard, and Experimental. Required covariates (rg and reported quality) are added for the user whether or not they are specified in the -cov list. There is now a -standard option in CountCovariates which will add in all of the standard covariates so the user doesn't have to type them all out or even know which ones are the standard. There is logger output to say which covariates are being used of course. The list of covariates used is also added to the PG tag in the bam file produced by TableRecalibration. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2338 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/recalibration/Covariate.java | 9 ++ .../recalibration/CovariateCounterWalker.java | 96 +++++++++---------- .../walkers/recalibration/CycleCovariate.java | 2 +- .../walkers/recalibration/DinucCovariate.java | 2 +- .../recalibration/HomopolymerCovariate.java | 2 +- .../recalibration/MappingQualityCovariate.java | 2 +- .../recalibration/MinimumNQSCovariate.java | 2 +- .../recalibration/PositionCovariate.java | 2 +- .../recalibration/PrimerRoundCovariate.java | 2 +- .../recalibration/QualityScoreCovariate.java | 2 +- .../recalibration/ReadGroupCovariate.java | 2 +- .../walkers/recalibration/TileCovariate.java | 2 +- 12 files changed, 64 insertions(+), 61 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java index cc298ee78..0fb6c179f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java @@ -43,3 +43,12 @@ public interface Covariate { public Comparable getValue( String str ); // Used to get the covariate's value from 
input csv file in TableRecalibrationWalker public int estimatedNumberOfBins(); // Used to estimate the amount space required for the full data HashMap } + +interface RequiredCovariate extends Covariate { +} + +interface StandardCovariate extends Covariate { +} + +interface ExperimentalCovariate extends Covariate { +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 87ac86afc..4a00a8b08 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -78,9 +78,11 @@ public class CovariateCounterWalker extends LocusWalker { // Command Line Arguments ///////////////////////////// @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) - private Boolean LIST_ONLY = false; - @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false) + private boolean LIST_ONLY = false; + @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. 
ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false) private String[] COVARIATES = null; + @Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false) + private boolean USE_STANDARD_COVARIATES = false; @Argument(fullName="process_nth_locus", shortName="pN", required=false, doc="Only process every Nth covered locus we see.") private int PROCESS_EVERY_NTH_LOCUS = 1; @@ -101,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker { private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. BUGBUG: I don't understand what is going on in this case private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private static final String versionString = "v2.1.0"; // Major version, minor version, and build number + private static final String versionString = "v2.1.1"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -129,12 +131,14 @@ public class CovariateCounterWalker extends LocusWalker { } // Get a list of all available covariates - final List> classes = PackageUtils.getClassesImplementingInterface( Covariate.class ); - + final List> covariateClasses = PackageUtils.getClassesImplementingInterface( Covariate.class ); + final 
List> requiredClasses = PackageUtils.getClassesImplementingInterface( RequiredCovariate.class ); + final List> standardClasses = PackageUtils.getClassesImplementingInterface( StandardCovariate.class ); + // Print and exit if that's what was requested if ( LIST_ONLY ) { out.println( "Available covariates:" ); - for( Class covClass : classes ) { + for( Class covClass : covariateClasses ) { out.println( covClass.getSimpleName() ); } out.println(); @@ -155,44 +159,38 @@ public class CovariateCounterWalker extends LocusWalker { // Initialize the requested covariates by parsing the -cov argument - // BUGBUG: This is a mess because there are a lot of cases (validate, all, none, and supplied covList). Clean up needed. requestedCovariates = new ArrayList(); - int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one + // First add the required covariates + if( requiredClasses.size() == 2) { // readGroup and reported quality score + requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here + requestedCovariates.add( new QualityScoreCovariate() ); + } else { + throw new StingException("There are more required covariates than expected. 
The instantiation list needs to be updated with the new required covariate and in the correct order."); + } + // Next add the standard covariates if -standard was specified by the user + if( USE_STANDARD_COVARIATES ) { + for( Class covClass : standardClasses ) { + try { + Covariate covariate = (Covariate)covClass.newInstance(); + requestedCovariates.add( covariate ); + } catch ( InstantiationException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); + } catch ( IllegalAccessException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); + } + } + } + // Finally parse the -cov arguments that were provided, skipping over the ones already specified if( COVARIATES != null ) { - if(COVARIATES[0].equalsIgnoreCase( "ALL" )) { // The user wants ALL covariates to be used - requestedCovariates.add( new ReadGroupCovariate() ); // First add the required covariates then add the rest by looping over all implementing classes that were found - requestedCovariates.add( new QualityScoreCovariate() ); - for( Class covClass : classes ) { - try { - Covariate covariate = (Covariate)covClass.newInstance(); - - estimatedCapacity *= covariate.estimatedNumberOfBins(); - if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // These were already added so don't add them again - requestedCovariates.add( covariate ); - } - } catch ( InstantiationException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); - } catch ( IllegalAccessException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); - } - } - } else { // The user has specified a list of several covariates - 
int covNumber = 1; - for( String requestedCovariateString : COVARIATES ) { - boolean foundClass = false; - for( Class covClass : classes ) { - if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class - foundClass = true; - // Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list - if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) || - (covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) { - throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" ); - } - covNumber++; + for( String requestedCovariateString : COVARIATES ) { + boolean foundClass = false; + for( Class covClass : covariateClasses ) { + if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class + foundClass = true; + if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) { try { // Now that we've found a matching class, try to instantiate it Covariate covariate = (Covariate)covClass.newInstance(); - estimatedCapacity *= covariate.estimatedNumberOfBins(); requestedCovariates.add( covariate ); } catch ( InstantiationException e ) { throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); @@ -201,17 +199,12 @@ public class CovariateCounterWalker extends LocusWalker { } } } + } - if( !foundClass ) { - throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." 
); - } + if( !foundClass ) { + throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." ); } } - } else { // No covariates were specified by the user so add the default, required ones - Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." ); - requestedCovariates.add( new ReadGroupCovariate() ); - requestedCovariates.add( new QualityScoreCovariate() ); - estimatedCapacity = 60 * 40; } logger.info( "The covariates being used here: " ); @@ -221,10 +214,11 @@ public class CovariateCounterWalker extends LocusWalker { } // Don't want to crash with out of heap space exception - if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed - estimatedCapacity = 300 * 40 * 200; - } - dataManager = new RecalDataManager( estimatedCapacity ); + //if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed + // estimatedCapacity = 300 * 40 * 200; + //} + + dataManager = new RecalDataManager(); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index fa53248d9..0062b4efd 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -42,7 +42,7 @@ import net.sf.samtools.SAMRecord; * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round */ -public class CycleCovariate implements Covariate { +public class CycleCovariate implements StandardCovariate { private static boolean warnedUserBadPlatform = false; private static String defaultPlatform; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java 
b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index ebb55a204..1278e7292 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -40,7 +40,7 @@ import net.sf.samtools.SAMRecord; * This assumption is made to speed up the code. */ -public class DinucCovariate implements Covariate { +public class DinucCovariate implements StandardCovariate { HashMap dinucHashMap; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java index c2f063917..ecf82131d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java @@ -38,7 +38,7 @@ import net.sf.samtools.SAMRecord; * 00100123121123456789 */ -public class HomopolymerCovariate implements Covariate { +public class HomopolymerCovariate implements ExperimentalCovariate { int numBack = 10; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index 882785dde..0b19fd57b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -35,7 +35,7 @@ import net.sf.samtools.SAMRecord; * The Mapping Quality covariate. 
*/ -public class MappingQualityCovariate implements Covariate { +public class MappingQualityCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index a396ac8c5..c97d553d2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -36,7 +36,7 @@ import net.sf.samtools.SAMRecord; * This covariate is the minimum base quality score in the read in a small window around the current base. */ -public class MinimumNQSCovariate implements Covariate { +public class MinimumNQSCovariate implements ExperimentalCovariate { private int windowReach; // How far in each direction from the current base to look diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index 17301460a..7c71db430 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -36,7 +36,7 @@ import net.sf.samtools.SAMRecord; * This is the Solexa definition of machine cycle and the covariate that was always being used in the original version of the recalibrator. 
*/ -public class PositionCovariate implements Covariate { +public class PositionCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index 329ba75e6..328f2dcd1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -37,7 +37,7 @@ import net.sf.samtools.SAMRecord; * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf */ -public class PrimerRoundCovariate implements Covariate { +public class PrimerRoundCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index 0b4a9e285..a174f4477 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -35,7 +35,7 @@ import net.sf.samtools.SAMRecord; * The Reported Quality Score covariate. 
*/ -public class QualityScoreCovariate implements Covariate { +public class QualityScoreCovariate implements RequiredCovariate { // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index fa847bcd1..255417409 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -35,7 +35,7 @@ import net.sf.samtools.SAMRecord; * The Read Group covariate. */ -public class ReadGroupCovariate implements Covariate{ +public class ReadGroupCovariate implements RequiredCovariate{ public static final String defaultReadGroup = "DefaultReadGroup"; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TileCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TileCovariate.java index b92217fd0..cd1551791 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TileCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TileCovariate.java @@ -30,7 +30,7 @@ import edu.mit.broad.picard.illumina.parser.IlluminaUtil; /** * @author alecw@broadinstitute.org */ -public class TileCovariate implements Covariate { +public class TileCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) {