Improvements to the CountCovariates and TableRecalibrator, as well as regression tests for SLX and 454 data

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1539 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-09-04 22:26:57 +00:00
parent 2b0d1c52b2
commit 1c3d67f0f3
7 changed files with 123 additions and 21 deletions

View File

@ -22,9 +22,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
public int buggyMaxReadLen = 100000;
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files")
public String OUTPUT_FILEROOT = "output";
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Depreciated output file root -- now use --params to directly specify the file output name")
public String OUTPUT_FILEROOT = null; // going to blow up if specified
@Argument(fullName="params", shortName="params", required=false, doc="Filename root for the outputted logistic regression training files")
public String params = "output.recal_data.csv";
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
public int MIN_MAPPING_QUALITY = 1;
@ -205,19 +208,19 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/
public PrintStream reduceInit() {
try {
return new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
if ( OUTPUT_FILEROOT != null )
throw new RuntimeException("OUTPUT_FILEROOT argument has been removed, please use --params from now on to directly specify the output parameter filename");
return new PrintStream( params );
} catch ( FileNotFoundException e ) {
throw new RuntimeException("Couldn't open output file", e);
}
}
public void onTraversalDone(PrintStream recalTableStream) {
printInfo(out);
out.printf("Writing raw recalibration data..."); out.flush();
out.printf("Writing raw recalibration data...");
writeRecalTable(recalTableStream);
out.printf("...done%n");
//out.printf("Writing logistic recalibration data%n");
//writeLogisticRecalibrationTable();
//out.printf("...done%n");
@ -230,7 +233,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* @param out
*/
private void printInfo(PrintStream out) {
out.printf("# date \"%s\"%n", new Date());
//out.printf("# date \"%s\"%n", new Date());
out.printf("# collapsed_pos %b%n", collapsePos);
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
out.printf("# counted_sites %d%n", counted_sites);
@ -249,8 +252,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
for (String readGroup : new TreeSet<String>(covariateCounter.getReadGroups()) ) {
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
if ( datum.N > 0 )
if ( datum.N > 0 ) {
recalTableStream.println(datum.toCSVString(collapsePos));
}
}
}
}

View File

@ -42,13 +42,13 @@ import java.io.FileNotFoundException;
import java.lang.reflect.Method;
@WalkerName("TableRecalibration")
@Requires({DataSource.READS, DataSource.REFERENCE})
@Requires({DataSource.READS}) // , DataSource.REFERENCE})
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(shortName="params", doc="CountCovariates params file", required=true)
@Argument(fullName="params", shortName="params", doc="CountCovariates params file", required=true)
public String paramsFile;
@Argument(fullName="outputBamFile", shortName="outputBAM", doc="output BAM file", required=false)
public SAMFileWriter outputBamFile = null;
@Argument(fullName="outputBam", shortName="outputBam", doc="output BAM file", required=true)
public SAMFileWriter outputBam = null;
@Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
public boolean rawQempirical = false;
@ -63,7 +63,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// Basic static information
//
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
private static String VERSION = "0.2.4";
private static String VERSION = "0.2.5";
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
@ -271,7 +271,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
for ( int i = 0; i < recalQuals.length; i++ ) {
if ( originalQuals[i] < preserveQScoresLessThan ) {
System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
//System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
recalQuals[i] = originalQuals[i];
}
}
@ -332,7 +332,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
public SAMFileWriter reduceInit() {
return outputBamFile;
return outputBam;
}
/**

View File

@ -139,7 +139,7 @@ public abstract class CommandLineProgram {
BasicConfigurator.configure();
}
public static int result = 0;
public static int result = -1;
/**
* This function is called to start processing the command line, and kick

View File

@ -2,6 +2,8 @@ package org.broadinstitute.sting;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
import org.junit.Test;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.CommandLineGATK;
@ -97,7 +99,7 @@ public class WalkerTest extends BaseTest {
return false;
}
protected List<String> executeTest(final String name, WalkerTestSpec spec) {
protected Pair<List<File>, List<String>> executeTest(final String name, WalkerTestSpec spec) {
List<File> tmpFiles = new ArrayList<File>();
for ( int i = 0; i < spec.nOutputFiles; i++ ) {
try {
@ -110,14 +112,19 @@ public class WalkerTest extends BaseTest {
final String args = String.format(spec.args, tmpFiles.toArray());
logger.warn(Utils.dupString('-', 80));
logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args));
CommandLineGATK instance = new CommandLineGATK();
CommandLineExecutable.start(instance, args.split(" "));
return assertMatchingMD5s(name, tmpFiles, spec.md5s);
if ( CommandLineExecutable.result != 0 ) {
throw new RuntimeException("Error running the GATK with arguments: " + args);
}
return new Pair<List<File>, List<String>>(tmpFiles, assertMatchingMD5s(name, tmpFiles, spec.md5s));
}
@Test
public void testWalkerTest() {
logger.warn("WalkerTest is just a framework");
//logger.warn("WalkerTest is just a framework");
}
}

View File

@ -44,6 +44,26 @@ public class SingleSampleGenotyperTest extends WalkerTest {
// return true;
//}
// --------------------------------------------------------------------------------------------------------------
//
// testing calls with SLX, 454, and SOLID data
//
// --------------------------------------------------------------------------------------------------------------
/**
 * Runs SingleSampleGenotyper with the empirical model over 1:10,000,000-10,100,000 of the
 * all-technologies NA12878 BAM and checks the single -varout output file against its expected MD5.
 */
@Test
public void testMultiTechnologies() {
    final String gatkArgs =
            "-T SingleSampleGenotyper" +
            " -R /broad/1KG/reference/human_b36_both.fasta" +
            " -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
            " -varout %s" +
            " -L 1:10,000,000-10,100,000" +
            " -m empirical";

    WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
            gatkArgs,
            1, // one output file, substituted for the %s after -varout
            Arrays.asList("b8975b303952edff3b0273165ba91001"));

    // The test name is a plain literal; wrapping it in String.format with no arguments added nothing.
    executeTest("testMultiTechnologies", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing the cache
@ -56,7 +76,7 @@ public class SingleSampleGenotyperTest extends WalkerTest {
WalkerTest.WalkerTestSpec withoutCacheSpec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-10,100,000 --disableCache -m " + model.toString(), 1,
Arrays.asList(""));
List<String> withoutCache = executeTest("empirical1MbTest", withoutCacheSpec );
List<String> withoutCache = executeTest("empirical1MbTest", withoutCacheSpec ).getSecond();
WalkerTest.WalkerTestSpec withCacheSpec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-10,100,000 -m " + model.toString(), 1,
@ -114,6 +134,8 @@ public class SingleSampleGenotyperTest extends WalkerTest {
executeTest("empirical1MbTest", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing output formats

View File

@ -0,0 +1,66 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.HashMap;
import java.util.Map;
import java.util.Arrays;
import java.util.List;
import java.io.File;
/**
 * Regression tests for the recalibration walkers (CountCovariates and TableRecalibration).
 * Each test runs the GATK on a validation BAM and compares the MD5 of the single output
 * file with a recorded value.
 */
public class RecalibrationWalkersTest extends WalkerTest {
    private static final String SLX_BAM =
            "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam";
    private static final String ROCHE_454_BAM =
            "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam";

    /** Maps an input BAM path to the CountCovariates params file generated for it. */
    static HashMap<String, String> paramsFiles = new HashMap<String, String>();

    /** Expected MD5 of the CountCovariates params file, keyed by input BAM path. */
    private static final HashMap<String, String> COUNT_COVARIATES_MD5S = new HashMap<String, String>();
    static {
        COUNT_COVARIATES_MD5S.put( SLX_BAM, "47664c48992f593258932583576b47e4" );
        // disabled: no expected MD5 recorded for the SOLID BAM
        //COUNT_COVARIATES_MD5S.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
        COUNT_COVARIATES_MD5S.put( ROCHE_454_BAM, "b58185dc5fbdd88ca9539d940dff6c1a" );
    }

    /**
     * Runs CountCovariates on {@code bam}, checks the params file against {@code md5}, and
     * records the params file path in {@link #paramsFiles}.
     *
     * @param bam path of the input BAM
     * @param md5 expected MD5 of the generated params file
     * @return absolute path of the generated params file
     */
    private String runCountCovariates(String bam, String md5) {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R /broad/1KG/reference/human_b36_both.fasta" +
                        " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                        " -T CountCovariates" +
                        " -I " + bam +
                        " -L 1:10,000,000-11,000,000" +
                        " --params %s",
                1, // just one output file
                Arrays.asList(md5));
        List<File> result = executeTest("testCountCovariates1", spec).getFirst();
        String paramsPath = result.get(0).getAbsolutePath();
        paramsFiles.put(bam, paramsPath);
        return paramsPath;
    }

    /** Runs CountCovariates on every BAM that has a recorded expected MD5. */
    @Test
    public void testCountCovariates1() {
        for ( Map.Entry<String, String> entry : COUNT_COVARIATES_MD5S.entrySet() ) {
            runCountCovariates(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Runs TableRecalibration on each BAM using the params file produced by CountCovariates.
     * The params file is generated on demand when it is not already in {@link #paramsFiles},
     * so this test does not depend on {@link #testCountCovariates1()} having run first.
     */
    @Test
    public void testTableRecalibrator1() {
        HashMap<String, String> e = new HashMap<String, String>();
        e.put( SLX_BAM, "c98525aca6493179f084159df0264782" );
        // disabled inputs, kept for reference
        //e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
        //e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "28c5026359a952a5c652b1ccc2acac04" );

        for ( Map.Entry<String, String> entry : e.entrySet() ) {
            String bam = entry.getKey();
            String md5 = entry.getValue();

            String paramsFile = paramsFiles.get(bam);
            if ( paramsFile == null ) {
                // Test method order is not guaranteed; without this the command line would read "--params null".
                String countCovariatesMd5 = COUNT_COVARIATES_MD5S.get(bam);
                if ( countCovariatesMd5 == null ) {
                    throw new IllegalStateException("No CountCovariates expected MD5 is recorded for " + bam);
                }
                paramsFile = runCountCovariates(bam, countCovariatesMd5);
            }
            System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);

            WalkerTestSpec spec = new WalkerTestSpec(
                    "-R /broad/1KG/reference/human_b36_both.fasta" +
                            " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                            " -T TableRecalibration" +
                            " -I " + bam +
                            " -L 1:10,000,000-20,000,000" +
                            " --outputBam %s" +
                            " --params " + paramsFile,
                    1, // just one output file
                    Arrays.asList(md5));
            executeTest("testTableRecalibrator1", spec);
        }
    }
}

View File

@ -0,0 +1,3 @@
# Two MergeBAMBatch.py invocations, both passing "-d freeze5.1 -q gsa".
# NOTE(review): the meaning of -d, -q, -n and -s is not visible here -- confirm against MergeBAMBatch.py.
# Invocation 1: low-coverage freeze5 list, with lists/naids_and_pop.txt supplied via -n.
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa lists/low_coverage_freeze5.list -n lists/naids_and_pop.txt
# Invocation 2: trios freeze5 list, supplied via -s.
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa -s lists/trios_freeze5.list