Improvements to the CountCovariates and TableRecablirator, as well as regression tests for SLX and 454 data
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1539 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2b0d1c52b2
commit
1c3d67f0f3
|
|
@ -22,9 +22,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
|
||||
public int buggyMaxReadLen = 100000;
|
||||
|
||||
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files")
|
||||
public String OUTPUT_FILEROOT = "output";
|
||||
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Depreciated output file root -- now use --params to directly specify the file output name")
|
||||
public String OUTPUT_FILEROOT = null; // going to blow up if specified
|
||||
|
||||
@Argument(fullName="params", shortName="params", required=false, doc="Filename root for the outputted logistic regression training files")
|
||||
public String params = "output.recal_data.csv";
|
||||
|
||||
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
||||
public int MIN_MAPPING_QUALITY = 1;
|
||||
|
||||
|
|
@ -205,19 +208,19 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
*/
|
||||
public PrintStream reduceInit() {
|
||||
try {
|
||||
return new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
|
||||
if ( OUTPUT_FILEROOT != null )
|
||||
throw new RuntimeException("OUTPUT_FILEROOT argument has been removed, please use --params from now on to directly specify the output parameter filename");
|
||||
return new PrintStream( params );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Couldn't open output file", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void onTraversalDone(PrintStream recalTableStream) {
|
||||
printInfo(out);
|
||||
|
||||
out.printf("Writing raw recalibration data..."); out.flush();
|
||||
out.printf("Writing raw recalibration data...");
|
||||
writeRecalTable(recalTableStream);
|
||||
out.printf("...done%n");
|
||||
|
||||
|
||||
//out.printf("Writing logistic recalibration data%n");
|
||||
//writeLogisticRecalibrationTable();
|
||||
//out.printf("...done%n");
|
||||
|
|
@ -230,7 +233,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
* @param out
|
||||
*/
|
||||
private void printInfo(PrintStream out) {
|
||||
out.printf("# date \"%s\"%n", new Date());
|
||||
//out.printf("# date \"%s\"%n", new Date());
|
||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||
out.printf("# counted_sites %d%n", counted_sites);
|
||||
|
|
@ -249,8 +252,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
||||
for (String readGroup : new TreeSet<String>(covariateCounter.getReadGroups()) ) {
|
||||
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
|
||||
if ( datum.N > 0 )
|
||||
if ( datum.N > 0 ) {
|
||||
recalTableStream.println(datum.toCSVString(collapsePos));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,13 +42,13 @@ import java.io.FileNotFoundException;
|
|||
import java.lang.reflect.Method;
|
||||
|
||||
@WalkerName("TableRecalibration")
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
@Requires({DataSource.READS}) // , DataSource.REFERENCE})
|
||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
@Argument(shortName="params", doc="CountCovariates params file", required=true)
|
||||
@Argument(fullName="params", shortName="params", doc="CountCovariates params file", required=true)
|
||||
public String paramsFile;
|
||||
|
||||
@Argument(fullName="outputBamFile", shortName="outputBAM", doc="output BAM file", required=false)
|
||||
public SAMFileWriter outputBamFile = null;
|
||||
@Argument(fullName="outputBam", shortName="outputBam", doc="output BAM file", required=true)
|
||||
public SAMFileWriter outputBam = null;
|
||||
|
||||
@Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
|
||||
public boolean rawQempirical = false;
|
||||
|
|
@ -63,7 +63,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// Basic static information
|
||||
//
|
||||
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
||||
private static String VERSION = "0.2.4";
|
||||
private static String VERSION = "0.2.5";
|
||||
|
||||
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
||||
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
||||
|
|
@ -271,7 +271,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
|
||||
for ( int i = 0; i < recalQuals.length; i++ ) {
|
||||
if ( originalQuals[i] < preserveQScoresLessThan ) {
|
||||
System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
|
||||
//System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
|
||||
recalQuals[i] = originalQuals[i];
|
||||
}
|
||||
}
|
||||
|
|
@ -332,7 +332,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
|
||||
public SAMFileWriter reduceInit() {
|
||||
return outputBamFile;
|
||||
return outputBam;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -139,7 +139,7 @@ public abstract class CommandLineProgram {
|
|||
BasicConfigurator.configure();
|
||||
}
|
||||
|
||||
public static int result = 0;
|
||||
public static int result = -1;
|
||||
|
||||
/**
|
||||
* This function is called to start processing the command line, and kick
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@ package org.broadinstitute.sting;
|
|||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
||||
import org.junit.Test;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
|
|
@ -97,7 +99,7 @@ public class WalkerTest extends BaseTest {
|
|||
return false;
|
||||
}
|
||||
|
||||
protected List<String> executeTest(final String name, WalkerTestSpec spec) {
|
||||
protected Pair<List<File>, List<String>> executeTest(final String name, WalkerTestSpec spec) {
|
||||
List<File> tmpFiles = new ArrayList<File>();
|
||||
for ( int i = 0; i < spec.nOutputFiles; i++ ) {
|
||||
try {
|
||||
|
|
@ -110,14 +112,19 @@ public class WalkerTest extends BaseTest {
|
|||
final String args = String.format(spec.args, tmpFiles.toArray());
|
||||
logger.warn(Utils.dupString('-', 80));
|
||||
logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args));
|
||||
|
||||
CommandLineGATK instance = new CommandLineGATK();
|
||||
CommandLineExecutable.start(instance, args.split(" "));
|
||||
|
||||
return assertMatchingMD5s(name, tmpFiles, spec.md5s);
|
||||
if ( CommandLineExecutable.result != 0 ) {
|
||||
throw new RuntimeException("Error running the GATK with arguments: " + args);
|
||||
}
|
||||
|
||||
return new Pair<List<File>, List<String>>(tmpFiles, assertMatchingMD5s(name, tmpFiles, spec.md5s));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWalkerTest() {
|
||||
logger.warn("WalkerTest is just a framework");
|
||||
//logger.warn("WalkerTest is just a framework");
|
||||
}
|
||||
}
|
||||
|
|
@ -44,6 +44,26 @@ public class SingleSampleGenotyperTest extends WalkerTest {
|
|||
// return true;
|
||||
//}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing calls with SLX, 454, and SOLID data
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testMultiTechnologies() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T SingleSampleGenotyper" +
|
||||
" -R /broad/1KG/reference/human_b36_both.fasta" +
|
||||
" -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
|
||||
" -varout %s" +
|
||||
" -L 1:10,000,000-10,100,000" +
|
||||
" -m empirical",
|
||||
1,
|
||||
Arrays.asList("b8975b303952edff3b0273165ba91001"));
|
||||
|
||||
executeTest(String.format("testMultiTechnologies"), spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing the cache
|
||||
|
|
@ -56,7 +76,7 @@ public class SingleSampleGenotyperTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec withoutCacheSpec = new WalkerTest.WalkerTestSpec(
|
||||
testGeliLod5() + " -L 1:10,000,000-10,100,000 --disableCache -m " + model.toString(), 1,
|
||||
Arrays.asList(""));
|
||||
List<String> withoutCache = executeTest("empirical1MbTest", withoutCacheSpec );
|
||||
List<String> withoutCache = executeTest("empirical1MbTest", withoutCacheSpec ).getSecond();
|
||||
|
||||
WalkerTest.WalkerTestSpec withCacheSpec = new WalkerTest.WalkerTestSpec(
|
||||
testGeliLod5() + " -L 1:10,000,000-10,100,000 -m " + model.toString(), 1,
|
||||
|
|
@ -114,6 +134,8 @@ public class SingleSampleGenotyperTest extends WalkerTest {
|
|||
executeTest("empirical1MbTest", spec);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing output formats
|
||||
|
|
|
|||
|
|
@ -0,0 +1,66 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.io.File;
|
||||
|
||||
public class RecalibrationWalkersTest extends WalkerTest {
|
||||
static HashMap<String, String> paramsFiles = new HashMap<String, String>();
|
||||
|
||||
@Test
|
||||
public void testCountCovariates1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "47664c48992f593258932583576b47e4" );
|
||||
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b58185dc5fbdd88ca9539d940dff6c1a" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
String md5 = entry.getValue();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R /broad/1KG/reference/human_b36_both.fasta" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -T CountCovariates" +
|
||||
" -I " + bam +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --params %s",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
List<File> result = executeTest("testCountCovariates1", spec).getFirst();
|
||||
paramsFiles.put(bam, result.get(0).getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTableRecalibrator1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "c98525aca6493179f084159df0264782" );
|
||||
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
|
||||
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "28c5026359a952a5c652b1ccc2acac04" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
String md5 = entry.getValue();
|
||||
String paramsFile = paramsFiles.get(bam);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-R /broad/1KG/reference/human_b36_both.fasta" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -T TableRecalibration" +
|
||||
" -I " + bam +
|
||||
" -L 1:10,000,000-20,000,000" +
|
||||
" --outputBam %s" +
|
||||
" --params " + paramsFile,
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
executeTest("testTableRecalibrator1", spec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa lists/low_coverage_freeze5.list -n lists/naids_and_pop.txt
|
||||
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa -s lists/trios_freeze5.list
|
||||
|
||||
Loading…
Reference in New Issue