Updated recalibrator integration tests to use all three platforms as well as a bam with multi-platform reads intermingled. CountCovariates v2.0.1: Once again uses a read filter to filter out zero mapping quality reads. Added --sorted_output option to output the table recalibration file in sorted order

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2122 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-23 19:51:36 +00:00
parent c299ca5f49
commit 7f947f6b60
5 changed files with 83 additions and 71 deletions

View File

@ -51,11 +51,11 @@ import net.sf.samtools.SAMReadGroupRecord;
*
* This walker is designed to work as the first pass in a two-pass processing step.
* It does a by-locus traversal operating only at sites that are not in dbSNP.
* We assume that all mismatches we see are therefore errors.
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
* We assume that all reference mismatches we see are therefore errors and indicative of poor base quality.
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide)
* Since there is a large amount of data one can then calculate an empirical probability of error
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
* The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score)
* The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score)
* The first lines of the output file give the name of the covariate classes that were used for this calculation.
*
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list.
@ -64,7 +64,7 @@ import net.sf.samtools.SAMReadGroupRecord;
@By( DataSource.READS ) // Only look at covered loci, not every locus of the reference file
@WalkerName( "CountCovariates" )
//@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality // BUGBUG taken out to match old integration tests
@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
@ -80,10 +80,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.")
private boolean NO_PRINT_HEADER = false;
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Deprecated. Match the output of the old recalibrator exactly. For debugging purposes only.")
private boolean VALIDATE_OLD_RECALIBRATOR = false;
// BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't.
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
private boolean USE_SLX_PLATFORM = false;
@Argument(fullName="sorted_output", shortName="sortedOutput", required=false, doc="The output table recalibration file will be in sorted order at the cost of added overhead. For debugging purposes only. This option is required in order to pass the integration tests.")
private boolean SORTED_OUTPUT = false;
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
@ -94,7 +98,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
private final String versionNumber = "2.0.0"; // major version, minor version, and build number
private final String versionNumber = "2.0.1"; // major version, minor version, and build number
//---------------------------------------------------------------------------------------------------------------
//
@ -301,46 +305,43 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
sizeOfReadDatumHashMap++;
}
if( readDatum.mappingQuality > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests
// skip first and last base because there is no dinuc
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
if( offset > 0 ) {
if( offset < readDatum.length - 1 ) {
// skip if base quality is zero
if( readDatum.quals[offset] > 0 ) {
// skip first and last base because there is no dinuc
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
if( offset > 0 ) {
if( offset < readDatum.length - 1 ) {
// skip if base quality is zero
if( readDatum.quals[offset] > 0 ) {
refBase = (byte)ref.getBase();
prevBase = readDatum.bases[offset-1];
refBase = (byte)ref.getBase();
prevBase = readDatum.bases[offset-1];
// Get the complement base strand if we are a negative strand read
if( readDatum.isNegStrand ) {
prevBase = readDatum.bases[offset+1];
}
// Get the complement base strand if we are a negative strand read
if( readDatum.isNegStrand ) {
prevBase = readDatum.bases[offset+1];
// skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
colorSpaceQuals = null;
if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
}
// skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
colorSpaceQuals = null;
if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
}
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
{
// This base finally passed all the checks, so add it to the big hashmap
updateDataFromRead( readDatum, offset, refBase );
}
} else {
if( VALIDATE_OLD_RECALIBRATOR ) {
countedBases++; // replicating a small bug in the old recalibrator
}
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
{
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( readDatum, offset, refBase );
}
} else {
if( VALIDATE_OLD_RECALIBRATOR ) {
countedBases++; // replicating a small bug in the old recalibrator
}
}
} else { // at the last base in the read so we can remove it from our IdentityHashMap
readDatumHashMap.remove( read );
sizeOfReadDatumHashMap--;
}
} else { // at the last base in the read so we can remove the read from our IdentityHashMap since we will never see it again
readDatumHashMap.remove( read );
sizeOfReadDatumHashMap--;
}
}
}
@ -377,7 +378,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
RecalDatum datum = dataManager.data.get( key );
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
if( VALIDATE_OLD_RECALIBRATOR ) {
if( VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) {
dataManager.data.myPut( key, datum );
} else {
dataManager.data.put( key, datum );
@ -439,7 +440,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/
private void outputToCSV( final PrintStream recalTableStream ) {
//BUGBUG: This method is a mess. It will be cleaned up when I get rid of the validation and no_header debug options.
if( VALIDATE_OLD_RECALIBRATOR ) {
// output the old header as well as output the data in sorted order
recalTableStream.printf("# collapsed_pos false%n");
recalTableStream.printf("# collapsed_dinuc false%n");
recalTableStream.printf("# counted_sites %d%n", countedSites);
@ -447,14 +450,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
recalTableStream.printf("# skipped_sites %d%n", skippedSites);
recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n");
// For each entry in the data hashmap
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) {
// For each Covariate in the key
for( Comparable comp : entry.first ) {
// Output the Covariate's value
recalTableStream.print( comp + "," );
}
// Output the RecalDatum entry
recalTableStream.println( entry.second.outputToCSV() );
}
} else {
@ -468,16 +467,27 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
}
}
// For each entry in the data hashmap
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
// For each Covariate in the key
for( Comparable comp : entry.getKey() ) {
// Output the Covariate's value
if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG
recalTableStream.print( comp + "," );
if( SORTED_OUTPUT )
{
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here
for( Comparable comp : entry.first ) {
recalTableStream.print( comp + "," );
}
recalTableStream.println( entry.second.outputToCSV() );
}
} else {
// For each entry in the data hashmap
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
// For each Covariate in the key
for( Comparable comp : entry.getKey() ) {
// Output the Covariate's value
if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG
recalTableStream.print( comp + "," );
}
// Output the RecalDatum entry
recalTableStream.println( entry.getValue().outputToCSV() );
}
// Output the RecalDatum entry
recalTableStream.println( entry.getValue().outputToCSV() );
}
}
}

View File

@ -89,6 +89,6 @@ public class CycleCovariate implements Covariate {
}
public String toString() {
return "Cycle";
return "Machine Cycle";
}
}

View File

@ -15,10 +15,11 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testCountCovariates1() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "7be0b7a624d22187e712131f12aae546" );
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "dc1a1b6e99f4a47525cc1dce7b6eb1dc" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "5bb3cb7445ec223cd450b05430cb00cd" );
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "dc1a1b6e99f4a47525cc1dce7b6eb1dc" ); // This file doesn't have read groups?
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "7842b33245a53b13c7b6cd4e4616ac11");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "cbe17568c5f03516a88ff0c5ce33a87b" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8806fb7f0e669d1abab61dc4b3d10d8f" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
String md5 = entry.getValue();
@ -28,9 +29,13 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" -T CountCovariates" +
" -I " + bam +
" -L 1:10,000,000-11,000,000" +
" --validate_old_recalibrator" +
" --use_slx_platform" +
( bam.equals( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
? " -L 1:10,800,000-10,810,000" : " -L 1:10,000,000-10,200,000" ) +
" -cov ReadGroupCovariate" +
" -cov QualityScoreCovariate" +
" -cov CycleCovariate" +
" -cov DinucCovariate" +
" --sorted_output" +
" -recalFile %s",
1, // just one output file
Arrays.asList(md5));
@ -42,9 +47,11 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testTableRecalibrator1() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "3ace4b9b8495429cc32e7008145f4996" );
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" );
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "7be0b7a624d22187e712131f12aae546" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "94f2c602fef9800e270326be9faab3ad");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "eea2e02ffe71c9a1f318d2e9cd31a103" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8126b9494837d504ef1277a799267c15" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -54,12 +61,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
if ( paramsFile != null ) {
WalkerTestSpec spec = new WalkerTestSpec(
"-R /broad/1KG/reference/human_b36_both.fasta" +
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" -T TableRecalibration" +
" -I " + bam +
" -L 1:10,000,000-20,000,000" +
" --validate_old_recalibrator" +
" --use_slx_platform" +
( bam.equals( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000" ) +
" -outputBam %s" +
" -recalFile " + paramsFile,
1, // just one output file

View File

@ -8,8 +8,6 @@ import org.junit.Assert;
import org.junit.Test;
import org.junit.Before;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.HashSet;
import java.util.Arrays;
/**

View File

@ -8,7 +8,6 @@ import org.junit.Assert;
import org.junit.Test;
import org.junit.Before;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;