Added a CountCovariates integration test that uses a VCF file as the list of variant sites to skip over, instead of the usual dbSNP rod.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2152 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-24 21:51:38 +00:00
parent 3484f652e7
commit c9ff5f209c
3 changed files with 43 additions and 11 deletions

View File

@ -110,7 +110,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private final String versionString = "v2.0.4"; // Major version, minor version, and build number private final String versionString = "v2.0.5"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -301,7 +301,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
} }
// This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again // This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
// Lots of lines just pulling things out of the SAMRecord and stuffing into local variables // Lots of lines just pulling things out of the SAMRecord and stuffing into local variables, many edge cases to worry about
// These lines should be exactly the same in TableRecalibrationWalker
quals = read.getBaseQualities(); quals = read.getBaseQualities();
// Check if we need to use the original quality scores instead // Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
@ -350,7 +351,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
refBase = (byte)ref.getBase(); refBase = (byte)ref.getBase();
prevBase = readDatum.bases[offset - 1]; prevBase = readDatum.bases[offset - 1];
// Get the complement base strand if we are a negative strand read, DinucCovariate is responsible for getting the complement bases if needed // DinucCovariate is responsible for getting the complement bases if needed
if( readDatum.isNegStrand ) { if( readDatum.isNegStrand ) {
prevBase = readDatum.bases[offset + 1]; prevBase = readDatum.bases[offset + 1];
} }
@ -415,8 +416,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref); int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) { if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) {
if ( readCharBaseIndex != refCharBaseIndex ) if ( readCharBaseIndex != refCharBaseIndex ) {
counts.first++; counts.first++;
}
counts.second++; counts.second++;
} }
} }
@ -426,8 +428,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* Validate the dbSNP mismatch rates. * Validate the dbSNP mismatch rates.
*/ */
private void validateDbsnpMismatchRate() { private void validateDbsnpMismatchRate() {
if ( novel_counts.second == 0 || dbSNP_counts.second == 0 ) if ( novel_counts.second == 0 || dbSNP_counts.second == 0 ) {
return; return;
}
double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second; double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second;
double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second; double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second;
@ -464,7 +467,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
if( VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) { if( VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) {
dataManager.data.myPut( key, datum ); dataManager.data.sortedPut( key, datum );
} else { } else {
dataManager.data.put( key, datum ); dataManager.data.put( key, datum );
} }

View File

@ -35,7 +35,7 @@ import java.util.*;
* User: rpoplin * User: rpoplin
* Date: Oct 30, 2009 * Date: Oct 30, 2009
* *
* A HashMap that maps a list of comparables to any object <T>. * A HashMap that maps a list of comparables to any Object <T>.
* There is functionality for the mappings to be given back to you in sorted order. * There is functionality for the mappings to be given back to you in sorted order.
*/ */
@ -55,9 +55,8 @@ public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
} }
// This method is here only to help facilitate direct comparison of old and refactored recalibrator. // This method is here only to help facilitate outputting the mappings in sorted order
// The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to. public T sortedPut(List<? extends Comparable> key, T value) {
public T myPut(List<? extends Comparable> key, T value) {
if( keyLists == null ) { if( keyLists == null ) {
keyLists = new ArrayList<ArrayList<Comparable>>(); keyLists = new ArrayList<ArrayList<Comparable>>();

View File

@ -20,6 +20,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "7842b33245a53b13c7b6cd4e4616ac11"); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "7842b33245a53b13c7b6cd4e4616ac11");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "cbe17568c5f03516a88ff0c5ce33a87b" ); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "cbe17568c5f03516a88ff0c5ce33a87b" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8806fb7f0e669d1abab61dc4b3d10d8f" ); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8806fb7f0e669d1abab61dc4b3d10d8f" );
for ( Map.Entry<String, String> entry : e.entrySet() ) { for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey(); String bam = entry.getKey();
String md5 = entry.getValue(); String md5 = entry.getValue();
@ -30,7 +32,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -T CountCovariates" + " -T CountCovariates" +
" -I " + bam + " -I " + bam +
( bam.equals( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) ( bam.equals( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
? " -L 1:10,800,000-10,810,000" : " -L 1:10,000,000-10,200,000" ) + ? " -L 1:10,800,000-10,810,000" : " -L 1:10,000,000-10,200,000" ) +
" -cov ReadGroupCovariate" + " -cov ReadGroupCovariate" +
" -cov QualityScoreCovariate" + " -cov QualityScoreCovariate" +
" -cov CycleCovariate" + " -cov CycleCovariate" +
@ -74,4 +76,32 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
} }
} }
} }
@Test
public void testCountCovariatesVCF() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "d5962c39a53a511d7ec710d5343e7a37");
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
String md5 = entry.getValue();
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R /broad/1KG/reference/human_b36_both.fasta" +
" -B dbsnp,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf" +
" -T CountCovariates" +
" -I " + bam +
" -L 1:10,000,000-10,200,000" +
" -cov ReadGroupCovariate" +
" -cov QualityScoreCovariate" +
" -cov CycleCovariate" +
" -cov DinucCovariate" +
" --sorted_output" +
" -recalFile %s",
1, // just one output file
Arrays.asList(md5));
executeTest("testCountCovariatesVCF", spec);
}
}
} }