Changed the format of the recalibration csv file slightly so that it is easier to load the file into something like R and look at the values of the covariates.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2183 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a184d28ce9
commit
d8146ab23d
|
|
@ -32,9 +32,11 @@ import net.sf.samtools.SAMRecord;
|
|||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 19, 2009
|
||||
*
|
||||
* Filter out reads that don't have Original Quality scores inside.
|
||||
*/
|
||||
public class NoOriginalQualityScoresFilter implements SamRecordFilter {
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
return (rec.getAttribute("OQ") == null);
|
||||
public boolean filterOut( final SAMRecord read ) {
|
||||
return (read.getAttribute("OQ") == null);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,8 +42,8 @@ public class SingleReadGroupFilter implements SamRecordFilter {
|
|||
@Argument(fullName = "read_group_to_keep", shortName = "goodRG", doc="The name of the read group to keep, filtering out all others", required=true)
|
||||
private String READ_GROUP_TO_KEEP = null;
|
||||
|
||||
public boolean filterOut(SAMRecord read) {
|
||||
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
public boolean filterOut( final SAMRecord read ) {
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
return !( readGroup != null && readGroup.getReadGroupId().equals( READ_GROUP_TO_KEEP ) );
|
||||
}
|
||||
}
|
||||
|
|
@ -102,7 +102,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private static final String versionString = "v2.0.7"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.0.8"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
|
|
@ -224,7 +224,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
|
||||
// Don't want to crash with out of heap space exception
|
||||
if(estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0) { // Could be negative if overflowed
|
||||
if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed
|
||||
estimatedCapacity = 300 * 40 * 200;
|
||||
}
|
||||
dataManager = new RecalDataManager( estimatedCapacity );
|
||||
|
|
@ -512,8 +512,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
|
||||
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
|
||||
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
|
||||
}
|
||||
recalTableStream.println("nObservations,nMismatches,Qempirical");
|
||||
}
|
||||
|
||||
if( SORTED_OUTPUT && requestedCovariates.size() == 4 )
|
||||
|
|
@ -530,7 +531,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
// For each Covariate in the key
|
||||
for( Comparable comp : entry.getKey() ) {
|
||||
// Output the Covariate's value
|
||||
if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ public class RecalibrationArgumentCollection {
|
|||
public String FORCE_PLATFORM = null;
|
||||
|
||||
//////////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
// Shared Debugging-only Arguments
|
||||
//////////////////////////////////
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="!!!Depricated!!! Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
|
||||
public boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
|
|
|
|||
|
|
@ -92,8 +92,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private ArrayList<Comparable> collapsedTableKey; // The key that will be used over and over again to query the collapsed tables
|
||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||
private static final String versionString = "v2.0.5"; // Major version, minor version, and build number
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String versionString = "v2.0.6"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -148,33 +148,35 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RAC.RECAL_FILE );
|
||||
} else { // Found another covariate in input file
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RAC.RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
|
||||
if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // The "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
} catch ( IllegalAccessException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
} catch ( IllegalAccessException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + line + ") isn't a valid covariate option." );
|
||||
if( !foundClass ) {
|
||||
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + vals[iii] + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else { // Found some data
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
|
|
@ -183,8 +185,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
requestedCovariates.add( new DinucCovariate() );
|
||||
}
|
||||
foundAllCovariates = true;
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new StingException( "Malformed input recalibration file. Covariate names can't be found in file: " + RAC.RECAL_FILE );
|
||||
}
|
||||
|
||||
// Don't want to crash with out of heap space exception
|
||||
if(estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0) { // Could be negative if overflowed
|
||||
if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed
|
||||
estimatedCapacity = 300 * 40 * 200;
|
||||
}
|
||||
final boolean createCollapsedTables = true;
|
||||
|
|
@ -223,7 +231,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
if( !NO_PG_TAG ) {
|
||||
SAMProgramRecord programRecord = new SAMProgramRecord( "TableRecalibrationWalker" );
|
||||
programRecord.setProgramVersion( versionString );
|
||||
programRecord.setCommandLine( "Covariates used: " + requestedCovariates );
|
||||
String commandLineString = "Covariates used: ";
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
commandLineString += cov.getClass().getSimpleName() + ", ";
|
||||
}
|
||||
commandLineString = commandLineString.substring(0, commandLineString.length() - 2); // trim off the trailing comma
|
||||
programRecord.setCommandLine( commandLineString );
|
||||
header.addProgramRecord( programRecord );
|
||||
}
|
||||
|
||||
|
|
@ -240,6 +253,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
*/
|
||||
private void addCSVData(String line) {
|
||||
String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new StingException("Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't parsed correctly.");
|
||||
}
|
||||
|
||||
ArrayList<Comparable> key = new ArrayList<Comparable>();
|
||||
Covariate cov;
|
||||
int iii;
|
||||
|
|
|
|||
|
|
@ -16,11 +16,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariates1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "5bb3cb7445ec223cd450b05430cb00cd" );
|
||||
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "dc1a1b6e99f4a47525cc1dce7b6eb1dc" ); // This file doesn't have read groups?
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "7842b33245a53b13c7b6cd4e4616ac11");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "cbe17568c5f03516a88ff0c5ce33a87b" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8806fb7f0e669d1abab61dc4b3d10d8f" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "16f87c9644b27c3c3fe7a963eed45d2d" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "602bc582a0e5507fee806db19f07b180");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "92b7ec1f3b5d33eb86be8298827d55f5" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "3caf6ec9c38ce78284159b279b0de0c0" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -45,12 +44,11 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
paramsFiles.put(bam, result.get(0).getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testTableRecalibrator1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" );
|
||||
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "7be0b7a624d22187e712131f12aae546" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "94f2c602fef9800e270326be9faab3ad");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "eea2e02ffe71c9a1f318d2e9cd31a103" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8126b9494837d504ef1277a799267c15" );
|
||||
|
|
@ -80,7 +78,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesVCF() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "d5962c39a53a511d7ec710d5343e7a37");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "1a2762916473bd037c72125c35d274d6");
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -104,11 +102,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testCountCovariatesNoReadGroups() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "ca9e1312102190bbd58f82809eabf175" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "437a6dd63c61d2814f42129ce393d7ce" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue