Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
418a4d541f
|
|
@ -25,15 +25,10 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||||
|
|
||||||
import org.broad.tribble.bed.BEDCodec;
|
import org.broad.tribble.Feature;
|
||||||
import org.broad.tribble.dbsnp.DbSNPCodec;
|
import org.broadinstitute.sting.commandline.*;
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
|
||||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
|
||||||
import org.broadinstitute.sting.commandline.Gather;
|
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
|
||||||
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter;
|
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter;
|
||||||
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
|
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
|
@ -42,8 +37,6 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
|
||||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
@ -94,13 +87,18 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||||
|
|
||||||
@Output
|
|
||||||
PrintStream out;
|
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Command Line Arguments
|
// Command Line Arguments
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the outputted covariates table recalibration file")
|
/**
|
||||||
|
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
|
||||||
|
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites.
|
||||||
|
*/
|
||||||
|
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
|
||||||
|
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
|
||||||
|
@Output
|
||||||
|
PrintStream out;
|
||||||
|
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
|
||||||
@Gather(CountCovariatesGatherer.class)
|
@Gather(CountCovariatesGatherer.class)
|
||||||
public PrintStream RECAL_FILE;
|
public PrintStream RECAL_FILE;
|
||||||
|
|
||||||
|
|
@ -124,8 +122,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // A list to hold the covariate objects that were requested
|
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // A list to hold the covariate objects that were requested
|
||||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||||
private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
|
private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
|
||||||
|
|
||||||
public static class CountedData {
|
public static class CountedData {
|
||||||
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
||||||
|
|
@ -136,7 +134,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
|
|
||||||
private long dbSNPCountsMM = 0, dbSNPCountsBases = 0; // mismatch/base counts for dbSNP loci
|
private long dbSNPCountsMM = 0, dbSNPCountsBases = 0; // mismatch/base counts for dbSNP loci
|
||||||
private long novelCountsMM = 0, novelCountsBases = 0; // mismatch/base counts for non-dbSNP loci
|
private long novelCountsMM = 0, novelCountsBases = 0; // mismatch/base counts for non-dbSNP loci
|
||||||
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
|
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds the values of other to this, returning this
|
* Adds the values of other to this, returning this
|
||||||
|
|
@ -190,20 +188,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
}
|
}
|
||||||
|
|
||||||
// Warn the user if no dbSNP file or other variant mask was specified
|
// Warn the user if no dbSNP file or other variant mask was specified
|
||||||
boolean foundDBSNP = false;
|
if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) {
|
||||||
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
|
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
|
||||||
if( rod != null ) {
|
|
||||||
if( rod.getType().equals(DbSNPCodec.class) ||
|
|
||||||
rod.getType().equals(VCFCodec.class) ||
|
|
||||||
rod.getType().equals(VCF3Codec.class) ||
|
|
||||||
rod.getType().equals(BEDCodec.class) ) {
|
|
||||||
foundDBSNP = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) {
|
|
||||||
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize the requested covariates by parsing the -cov argument
|
// Initialize the requested covariates by parsing the -cov argument
|
||||||
|
|
@ -266,12 +252,6 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||||
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
||||||
}
|
}
|
||||||
|
|
||||||
// try {
|
|
||||||
// stream = new PrintStream( RAC.RECAL_FILE );
|
|
||||||
// } catch ( FileNotFoundException e ) {
|
|
||||||
// throw new RuntimeException( "Couldn't open output file: ", e );
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -290,16 +270,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
* @return Returns 1, but this value isn't used in the reduce step
|
* @return Returns 1, but this value isn't used in the reduce step
|
||||||
*/
|
*/
|
||||||
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||||
// If any ROD covers this site then we assume it is a site of known genetic variation and we skip it
|
|
||||||
boolean isSNP = tracker.getNTracksWithBoundFeatures() > 0;
|
|
||||||
|
|
||||||
// Only use data from non-dbsnp sites
|
// Only use data from non-dbsnp sites
|
||||||
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
|
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
|
||||||
CountedData counter = new CountedData();
|
CountedData counter = new CountedData();
|
||||||
if( !isSNP ) {
|
if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
||||||
// For each read at this locus
|
// For each read at this locus
|
||||||
for( PileupElement p : context.getBasePileup() ) {
|
for( final PileupElement p : context.getBasePileup() ) {
|
||||||
GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead();
|
final GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead();
|
||||||
int offset = p.getOffset();
|
int offset = p.getOffset();
|
||||||
|
|
||||||
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
|
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
|
||||||
|
|
@ -358,8 +335,6 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
return counter;
|
return counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Update the mismatch / total_base counts for a given class of loci.
|
* Update the mismatch / total_base counts for a given class of loci.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -93,8 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
|
|
||||||
@Output(doc="The output BAM file", required=true)
|
@Output(doc="The output BAM file", required=true)
|
||||||
private StingSAMFileWriter OUTPUT_BAM = null;
|
private StingSAMFileWriter OUTPUT_BAM = null;
|
||||||
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
|
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||||
doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
|
||||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1")
|
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1")
|
||||||
private int SMOOTHING = 1;
|
private int SMOOTHING = 1;
|
||||||
|
|
|
||||||
|
|
@ -54,7 +54,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + b36KGReference +
|
"-R " + b36KGReference +
|
||||||
" -B:dbsnp,vcf " + b36dbSNP129 +
|
" -knownSites " + b36dbSNP129 +
|
||||||
" -T CountCovariates" +
|
" -T CountCovariates" +
|
||||||
" -I " + bam +
|
" -I " + bam +
|
||||||
( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
|
( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
|
||||||
|
|
@ -135,7 +135,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
" -standard" +
|
" -standard" +
|
||||||
" -OQ" +
|
" -OQ" +
|
||||||
" -recalFile %s" +
|
" -recalFile %s" +
|
||||||
" -B:dbsnp,vcf " + b36dbSNP129,
|
" -knownSites " + b36dbSNP129,
|
||||||
1, // just one output file
|
1, // just one output file
|
||||||
Arrays.asList(md5));
|
Arrays.asList(md5));
|
||||||
executeTest("testCountCovariatesUseOriginalQuals", spec);
|
executeTest("testCountCovariatesUseOriginalQuals", spec);
|
||||||
|
|
@ -182,7 +182,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + b36KGReference +
|
"-R " + b36KGReference +
|
||||||
" -B:dbsnp,vcf " + b36dbSNP129 +
|
" -knownSites " + b36dbSNP129 +
|
||||||
" -T CountCovariates" +
|
" -T CountCovariates" +
|
||||||
" -I " + bam +
|
" -I " + bam +
|
||||||
" -standard" +
|
" -standard" +
|
||||||
|
|
@ -225,30 +225,6 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testCountCovariatesVCF() {
|
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
|
||||||
e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "170f0c3cc4b8d72c539136effeec9a16");
|
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
|
||||||
String bam = entry.getKey();
|
|
||||||
String md5 = entry.getValue();
|
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
|
||||||
"-R " + b36KGReference +
|
|
||||||
" -B:dbsnp,VCF3 " + validationDataLocation + "vcfexample3.vcf" +
|
|
||||||
" -T CountCovariates" +
|
|
||||||
" -I " + bam +
|
|
||||||
" -L 1:10,000,000-10,200,000" +
|
|
||||||
" -standard" +
|
|
||||||
" --solid_recal_mode SET_Q_ZERO" +
|
|
||||||
" -recalFile %s",
|
|
||||||
1, // just one output file
|
|
||||||
Arrays.asList(md5));
|
|
||||||
executeTest("testCountCovariatesVCF", spec);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCountCovariatesBED() {
|
public void testCountCovariatesBED() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
|
|
@ -260,7 +236,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + b36KGReference +
|
"-R " + b36KGReference +
|
||||||
" -B:bed,bed " + validationDataLocation + "recalibrationTest.bed" +
|
" -knownSites:bed " + validationDataLocation + "recalibrationTest.bed" +
|
||||||
" -T CountCovariates" +
|
" -T CountCovariates" +
|
||||||
" -I " + bam +
|
" -I " + bam +
|
||||||
" -L 1:10,000,000-10,200,000" +
|
" -L 1:10,000,000-10,200,000" +
|
||||||
|
|
@ -284,10 +260,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + b36KGReference +
|
"-R " + b36KGReference +
|
||||||
" -B:anyNameABCD,VCF3 " + validationDataLocation + "vcfexample3.vcf" +
|
" -knownSites:anyNameABCD,VCF3 " + validationDataLocation + "vcfexample3.vcf" +
|
||||||
" -T CountCovariates" +
|
" -T CountCovariates" +
|
||||||
" -I " + bam +
|
" -I " + bam +
|
||||||
" -B:dbsnp,vcf " + b36dbSNP129 +
|
" -knownSites " + b36dbSNP129 +
|
||||||
" -L 1:10,000,000-10,200,000" +
|
" -L 1:10,000,000-10,200,000" +
|
||||||
" -cov ReadGroupCovariate" +
|
" -cov ReadGroupCovariate" +
|
||||||
" -cov QualityScoreCovariate" +
|
" -cov QualityScoreCovariate" +
|
||||||
|
|
@ -312,7 +288,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + b36KGReference +
|
"-R " + b36KGReference +
|
||||||
" -B:dbsnp,vcf " + b36dbSNP129 +
|
" -knownSites " + b36dbSNP129 +
|
||||||
" -T CountCovariates" +
|
" -T CountCovariates" +
|
||||||
" -I " + bam +
|
" -I " + bam +
|
||||||
" -cov ReadGroupCovariate" +
|
" -cov ReadGroupCovariate" +
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue