The recalibrator now uses all input RODs when looking for known polymorphic sites not just the one named dbsnp. Added an integration test which uses both dbsnp and an input vcf file and skips over the union of the two.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2564 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-12 18:50:39 +00:00
parent 16777e3875
commit 189829841b
3 changed files with 38 additions and 11 deletions

View File

@ -125,7 +125,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
} }
/** /**
* given a existing file, open it and append all the valid triplet lines to an existing list * given an existing file, open it and append all the valid triplet lines to an existing list
* *
* @param rodTripletList the list of existing triplets * @param rodTripletList the list of existing triplets
* @param filename the file to attempt to extract ROD triplets from * @param filename the file to attempt to extract ROD triplets from

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RODRecordList; import org.broadinstitute.sting.gatk.refdata.RODRecordList;
import org.broadinstitute.sting.gatk.refdata.VariationRod;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
@ -103,7 +104,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private static final String versionString = "v2.2.4"; // Major version, minor version, and build number private static final String versionString = "v2.2.5"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -149,8 +150,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Warn the user if no dbSNP file was specified // Warn the user if no dbSNP file was specified
boolean foundDBSNP = false; boolean foundDBSNP = false;
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) { if( rod != null ) {
foundDBSNP = true; foundDBSNP = true;
break;
} }
} }
if( !foundDBSNP ) { if( !foundDBSNP ) {
@ -232,17 +234,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/ */
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
// Pull out anything passed by -B name,type,file that has the name "dbsnp" // Pull out data for this locus for all the input RODs and check if this is a known variant site in any of them
final RODRecordList<ReferenceOrderedDatum> dbsnpRODs = tracker.getTrackData( "dbsnp", null );
boolean isSNP = false; boolean isSNP = false;
if (dbsnpRODs != null) { for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
for( ReferenceOrderedDatum rod : dbsnpRODs ) { if( rod != null && rod instanceof Variation && ((Variation)rod).isSNP() ) {
if( ((Variation)rod).isSNP() ) {
isSNP = true; // At least one of the rods says this is a snp site isSNP = true; // At least one of the rods says this is a snp site
break; break;
} }
} }
}
// Only use data from non-dbsnp sites // Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality // Assume every mismatch at a non-dbsnp site is indicitive of poor quality

View File

@ -133,6 +133,34 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
} }
} }
@Test
public void testCountCovariatesVCFPlusDBsnp() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "cc1cc9c1ff184d388d81574fdccc608e");
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
String md5 = entry.getValue();
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
" -B anyNameABCD,VCF," + validationDataLocation + "vcfexample3.vcf" +
" -T CountCovariates" +
" -I " + bam +
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" -L 1:10,000,000-10,200,000" +
" -cov ReadGroupCovariate" +
" -cov QualityScoreCovariate" +
" -cov CycleCovariate" +
" -cov DinucCovariate" +
" --solid_recal_mode SET_Q_ZERO" +
" -recalFile %s",
1, // just one output file
Arrays.asList(md5));
executeTest("testCountCovariatesVCFPlusDBsnp", spec);
}
}
@Test @Test
public void testCountCovariatesNoReadGroups() { public void testCountCovariatesNoReadGroups() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();