Improvements to the CountCovariates and TableRecalibrator, as well as regression tests for SLX and 454 data
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1539 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2b0d1c52b2
commit
1c3d67f0f3
|
|
@ -22,8 +22,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
|
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
|
||||||
public int buggyMaxReadLen = 100000;
|
public int buggyMaxReadLen = 100000;
|
||||||
|
|
||||||
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files")
|
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Depreciated output file root -- now use --params to directly specify the file output name")
|
||||||
public String OUTPUT_FILEROOT = "output";
|
public String OUTPUT_FILEROOT = null; // going to blow up if specified
|
||||||
|
|
||||||
|
@Argument(fullName="params", shortName="params", required=false, doc="Filename root for the outputted logistic regression training files")
|
||||||
|
public String params = "output.recal_data.csv";
|
||||||
|
|
||||||
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
||||||
public int MIN_MAPPING_QUALITY = 1;
|
public int MIN_MAPPING_QUALITY = 1;
|
||||||
|
|
@ -205,16 +208,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
*/
|
*/
|
||||||
public PrintStream reduceInit() {
|
public PrintStream reduceInit() {
|
||||||
try {
|
try {
|
||||||
return new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
|
if ( OUTPUT_FILEROOT != null )
|
||||||
|
throw new RuntimeException("OUTPUT_FILEROOT argument has been removed, please use --params from now on to directly specify the output parameter filename");
|
||||||
|
return new PrintStream( params );
|
||||||
} catch ( FileNotFoundException e ) {
|
} catch ( FileNotFoundException e ) {
|
||||||
throw new RuntimeException("Couldn't open output file", e);
|
throw new RuntimeException("Couldn't open output file", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone(PrintStream recalTableStream) {
|
public void onTraversalDone(PrintStream recalTableStream) {
|
||||||
printInfo(out);
|
out.printf("Writing raw recalibration data...");
|
||||||
|
|
||||||
out.printf("Writing raw recalibration data..."); out.flush();
|
|
||||||
writeRecalTable(recalTableStream);
|
writeRecalTable(recalTableStream);
|
||||||
out.printf("...done%n");
|
out.printf("...done%n");
|
||||||
|
|
||||||
|
|
@ -230,7 +233,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
* @param out
|
* @param out
|
||||||
*/
|
*/
|
||||||
private void printInfo(PrintStream out) {
|
private void printInfo(PrintStream out) {
|
||||||
out.printf("# date \"%s\"%n", new Date());
|
//out.printf("# date \"%s\"%n", new Date());
|
||||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||||
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||||
out.printf("# counted_sites %d%n", counted_sites);
|
out.printf("# counted_sites %d%n", counted_sites);
|
||||||
|
|
@ -249,11 +252,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
||||||
for (String readGroup : new TreeSet<String>(covariateCounter.getReadGroups()) ) {
|
for (String readGroup : new TreeSet<String>(covariateCounter.getReadGroups()) ) {
|
||||||
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
|
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
|
||||||
if ( datum.N > 0 )
|
if ( datum.N > 0 ) {
|
||||||
recalTableStream.println(datum.toCSVString(collapsePos));
|
recalTableStream.println(datum.toCSVString(collapsePos));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Doesn't do anything
|
* Doesn't do anything
|
||||||
|
|
|
||||||
|
|
@ -42,13 +42,13 @@ import java.io.FileNotFoundException;
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
|
|
||||||
@WalkerName("TableRecalibration")
|
@WalkerName("TableRecalibration")
|
||||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
@Requires({DataSource.READS}) // , DataSource.REFERENCE})
|
||||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||||
@Argument(shortName="params", doc="CountCovariates params file", required=true)
|
@Argument(fullName="params", shortName="params", doc="CountCovariates params file", required=true)
|
||||||
public String paramsFile;
|
public String paramsFile;
|
||||||
|
|
||||||
@Argument(fullName="outputBamFile", shortName="outputBAM", doc="output BAM file", required=false)
|
@Argument(fullName="outputBam", shortName="outputBam", doc="output BAM file", required=true)
|
||||||
public SAMFileWriter outputBamFile = null;
|
public SAMFileWriter outputBam = null;
|
||||||
|
|
||||||
@Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
|
@Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
|
||||||
public boolean rawQempirical = false;
|
public boolean rawQempirical = false;
|
||||||
|
|
@ -63,7 +63,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
// Basic static information
|
// Basic static information
|
||||||
//
|
//
|
||||||
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
||||||
private static String VERSION = "0.2.4";
|
private static String VERSION = "0.2.5";
|
||||||
|
|
||||||
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
||||||
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
||||||
|
|
@ -271,7 +271,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
|
private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
|
||||||
for ( int i = 0; i < recalQuals.length; i++ ) {
|
for ( int i = 0; i < recalQuals.length; i++ ) {
|
||||||
if ( originalQuals[i] < preserveQScoresLessThan ) {
|
if ( originalQuals[i] < preserveQScoresLessThan ) {
|
||||||
System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
|
//System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
|
||||||
recalQuals[i] = originalQuals[i];
|
recalQuals[i] = originalQuals[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -332,7 +332,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
}
|
}
|
||||||
|
|
||||||
public SAMFileWriter reduceInit() {
|
public SAMFileWriter reduceInit() {
|
||||||
return outputBamFile;
|
return outputBam;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -139,7 +139,7 @@ public abstract class CommandLineProgram {
|
||||||
BasicConfigurator.configure();
|
BasicConfigurator.configure();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int result = 0;
|
public static int result = -1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function is called to start processing the command line, and kick
|
* This function is called to start processing the command line, and kick
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,8 @@ package org.broadinstitute.sting;
|
||||||
|
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||||
|
|
@ -97,7 +99,7 @@ public class WalkerTest extends BaseTest {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected List<String> executeTest(final String name, WalkerTestSpec spec) {
|
protected Pair<List<File>, List<String>> executeTest(final String name, WalkerTestSpec spec) {
|
||||||
List<File> tmpFiles = new ArrayList<File>();
|
List<File> tmpFiles = new ArrayList<File>();
|
||||||
for ( int i = 0; i < spec.nOutputFiles; i++ ) {
|
for ( int i = 0; i < spec.nOutputFiles; i++ ) {
|
||||||
try {
|
try {
|
||||||
|
|
@ -110,14 +112,19 @@ public class WalkerTest extends BaseTest {
|
||||||
final String args = String.format(spec.args, tmpFiles.toArray());
|
final String args = String.format(spec.args, tmpFiles.toArray());
|
||||||
logger.warn(Utils.dupString('-', 80));
|
logger.warn(Utils.dupString('-', 80));
|
||||||
logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args));
|
logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args));
|
||||||
|
|
||||||
CommandLineGATK instance = new CommandLineGATK();
|
CommandLineGATK instance = new CommandLineGATK();
|
||||||
CommandLineExecutable.start(instance, args.split(" "));
|
CommandLineExecutable.start(instance, args.split(" "));
|
||||||
|
|
||||||
return assertMatchingMD5s(name, tmpFiles, spec.md5s);
|
if ( CommandLineExecutable.result != 0 ) {
|
||||||
|
throw new RuntimeException("Error running the GATK with arguments: " + args);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Pair<List<File>, List<String>>(tmpFiles, assertMatchingMD5s(name, tmpFiles, spec.md5s));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWalkerTest() {
|
public void testWalkerTest() {
|
||||||
logger.warn("WalkerTest is just a framework");
|
//logger.warn("WalkerTest is just a framework");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -44,6 +44,26 @@ public class SingleSampleGenotyperTest extends WalkerTest {
|
||||||
// return true;
|
// return true;
|
||||||
//}
|
//}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// testing calls with SLX, 454, and SOLID data
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
@Test
|
||||||
|
public void testMultiTechnologies() {
|
||||||
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
"-T SingleSampleGenotyper" +
|
||||||
|
" -R /broad/1KG/reference/human_b36_both.fasta" +
|
||||||
|
" -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
|
||||||
|
" -varout %s" +
|
||||||
|
" -L 1:10,000,000-10,100,000" +
|
||||||
|
" -m empirical",
|
||||||
|
1,
|
||||||
|
Arrays.asList("b8975b303952edff3b0273165ba91001"));
|
||||||
|
|
||||||
|
executeTest(String.format("testMultiTechnologies"), spec);
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// testing the cache
|
// testing the cache
|
||||||
|
|
@ -56,7 +76,7 @@ public class SingleSampleGenotyperTest extends WalkerTest {
|
||||||
WalkerTest.WalkerTestSpec withoutCacheSpec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec withoutCacheSpec = new WalkerTest.WalkerTestSpec(
|
||||||
testGeliLod5() + " -L 1:10,000,000-10,100,000 --disableCache -m " + model.toString(), 1,
|
testGeliLod5() + " -L 1:10,000,000-10,100,000 --disableCache -m " + model.toString(), 1,
|
||||||
Arrays.asList(""));
|
Arrays.asList(""));
|
||||||
List<String> withoutCache = executeTest("empirical1MbTest", withoutCacheSpec );
|
List<String> withoutCache = executeTest("empirical1MbTest", withoutCacheSpec ).getSecond();
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec withCacheSpec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec withCacheSpec = new WalkerTest.WalkerTestSpec(
|
||||||
testGeliLod5() + " -L 1:10,000,000-10,100,000 -m " + model.toString(), 1,
|
testGeliLod5() + " -L 1:10,000,000-10,100,000 -m " + model.toString(), 1,
|
||||||
|
|
@ -114,6 +134,8 @@ public class SingleSampleGenotyperTest extends WalkerTest {
|
||||||
executeTest("empirical1MbTest", spec);
|
executeTest("empirical1MbTest", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// testing output formats
|
// testing output formats
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,66 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.WalkerTest;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
public class RecalibrationWalkersTest extends WalkerTest {
|
||||||
|
static HashMap<String, String> paramsFiles = new HashMap<String, String>();
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCountCovariates1() {
|
||||||
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "47664c48992f593258932583576b47e4" );
|
||||||
|
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
|
||||||
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b58185dc5fbdd88ca9539d940dff6c1a" );
|
||||||
|
|
||||||
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
|
String bam = entry.getKey();
|
||||||
|
String md5 = entry.getValue();
|
||||||
|
|
||||||
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
"-R /broad/1KG/reference/human_b36_both.fasta" +
|
||||||
|
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||||
|
" -T CountCovariates" +
|
||||||
|
" -I " + bam +
|
||||||
|
" -L 1:10,000,000-11,000,000" +
|
||||||
|
" --params %s",
|
||||||
|
1, // just one output file
|
||||||
|
Arrays.asList(md5));
|
||||||
|
List<File> result = executeTest("testCountCovariates1", spec).getFirst();
|
||||||
|
paramsFiles.put(bam, result.get(0).getAbsolutePath());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTableRecalibrator1() {
|
||||||
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "c98525aca6493179f084159df0264782" );
|
||||||
|
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
|
||||||
|
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "28c5026359a952a5c652b1ccc2acac04" );
|
||||||
|
|
||||||
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
|
String bam = entry.getKey();
|
||||||
|
String md5 = entry.getValue();
|
||||||
|
String paramsFile = paramsFiles.get(bam);
|
||||||
|
System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-R /broad/1KG/reference/human_b36_both.fasta" +
|
||||||
|
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||||
|
" -T TableRecalibration" +
|
||||||
|
" -I " + bam +
|
||||||
|
" -L 1:10,000,000-20,000,000" +
|
||||||
|
" --outputBam %s" +
|
||||||
|
" --params " + paramsFile,
|
||||||
|
1, // just one output file
|
||||||
|
Arrays.asList(md5));
|
||||||
|
executeTest("testTableRecalibrator1", spec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa lists/low_coverage_freeze5.list -n lists/naids_and_pop.txt
|
||||||
|
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa -s lists/trios_freeze5.list
|
||||||
|
|
||||||
Loading…
Reference in New Issue