Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Mark DePristo 2011-08-11 11:01:38 -04:00
commit 418a4d541f
3 changed files with 27 additions and 77 deletions

View File

@ -25,15 +25,10 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broad.tribble.bed.BEDCodec;
import org.broad.tribble.dbsnp.DbSNPCodec;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Gather;
import org.broadinstitute.sting.commandline.Output;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -42,8 +37,6 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -94,13 +87,18 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@Output
PrintStream out;
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the outputted covariates table recalibration file")
/**
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites.
*/
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
@Output
PrintStream out;
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
@Gather(CountCovariatesGatherer.class)
public PrintStream RECAL_FILE;
@ -124,8 +122,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/////////////////////////////
private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // A list to hold the covariate objects that were requested
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
public static class CountedData {
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
@ -136,7 +134,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
private long dbSNPCountsMM = 0, dbSNPCountsBases = 0; // mismatch/base counts for dbSNP loci
private long novelCountsMM = 0, novelCountsBases = 0; // mismatch/base counts for non-dbSNP loci
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
/**
* Adds the values of other to this, returning this
@ -190,20 +188,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
// Warn the user if no dbSNP file or other variant mask was specified
boolean foundDBSNP = false;
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
if( rod != null ) {
if( rod.getType().equals(DbSNPCodec.class) ||
rod.getType().equals(VCFCodec.class) ||
rod.getType().equals(VCF3Codec.class) ||
rod.getType().equals(BEDCodec.class) ) {
foundDBSNP = true;
break;
}
}
}
if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) {
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation.");
if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) {
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
}
// Initialize the requested covariates by parsing the -cov argument
@ -266,12 +252,6 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
logger.info( "\t" + cov.getClass().getSimpleName() );
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
}
// try {
// stream = new PrintStream( RAC.RECAL_FILE );
// } catch ( FileNotFoundException e ) {
// throw new RuntimeException( "Couldn't open output file: ", e );
// }
}
@ -290,16 +270,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* @return Returns 1, but this value isn't used in the reduce step
*/
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
// If any ROD covers this site then we assume it is a site of known genetic variation and we skip it
boolean isSNP = tracker.getNTracksWithBoundFeatures() > 0;
// Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
CountedData counter = new CountedData();
if( !isSNP ) {
if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
// For each read at this locus
for( PileupElement p : context.getBasePileup() ) {
GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead();
for( final PileupElement p : context.getBasePileup() ) {
final GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead();
int offset = p.getOffset();
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
@ -358,8 +335,6 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
return counter;
}
/**
* Update the mismatch / total_base counts for a given class of loci.
*

View File

@ -93,8 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Output(doc="The output BAM file", required=true)
private StingSAMFileWriter OUTPUT_BAM = null;
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
private int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1")
private int SMOOTHING = 1;

View File

@ -54,7 +54,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -B:dbsnp,vcf " + b36dbSNP129 +
" -knownSites " + b36dbSNP129 +
" -T CountCovariates" +
" -I " + bam +
( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
@ -135,7 +135,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -standard" +
" -OQ" +
" -recalFile %s" +
" -B:dbsnp,vcf " + b36dbSNP129,
" -knownSites " + b36dbSNP129,
1, // just one output file
Arrays.asList(md5));
executeTest("testCountCovariatesUseOriginalQuals", spec);
@ -182,7 +182,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -B:dbsnp,vcf " + b36dbSNP129 +
" -knownSites " + b36dbSNP129 +
" -T CountCovariates" +
" -I " + bam +
" -standard" +
@ -225,30 +225,6 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
}
}
@Test
public void testCountCovariatesVCF() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "170f0c3cc4b8d72c539136effeec9a16");
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
String md5 = entry.getValue();
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -B:dbsnp,VCF3 " + validationDataLocation + "vcfexample3.vcf" +
" -T CountCovariates" +
" -I " + bam +
" -L 1:10,000,000-10,200,000" +
" -standard" +
" --solid_recal_mode SET_Q_ZERO" +
" -recalFile %s",
1, // just one output file
Arrays.asList(md5));
executeTest("testCountCovariatesVCF", spec);
}
}
@Test
public void testCountCovariatesBED() {
HashMap<String, String> e = new HashMap<String, String>();
@ -260,7 +236,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -B:bed,bed " + validationDataLocation + "recalibrationTest.bed" +
" -knownSites:bed " + validationDataLocation + "recalibrationTest.bed" +
" -T CountCovariates" +
" -I " + bam +
" -L 1:10,000,000-10,200,000" +
@ -284,10 +260,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -B:anyNameABCD,VCF3 " + validationDataLocation + "vcfexample3.vcf" +
" -knownSites:anyNameABCD,VCF3 " + validationDataLocation + "vcfexample3.vcf" +
" -T CountCovariates" +
" -I " + bam +
" -B:dbsnp,vcf " + b36dbSNP129 +
" -knownSites " + b36dbSNP129 +
" -L 1:10,000,000-10,200,000" +
" -cov ReadGroupCovariate" +
" -cov QualityScoreCovariate" +
@ -312,7 +288,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -B:dbsnp,vcf " + b36dbSNP129 +
" -knownSites " + b36dbSNP129 +
" -T CountCovariates" +
" -I " + bam +
" -cov ReadGroupCovariate" +