2 separate changes which both affect lots of UG integration md5s, so I'm committing them together:
1. allele balance annotation is now weighted by genotype quality (so we don't get misled by borderline het calls) 2. Updates to the Unified Genotyper for parallelization: a. verbose writing now works again; arg was moved from UAC to UG b. UG checks for command that don't work with parallelization c. some cleanup git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2432 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
872a9d1c7b
commit
dc96879861
|
|
@ -48,8 +48,9 @@ public class AlleleBalance extends StandardVariantAnnotation {
|
|||
int aCount = Utils.countOccurrences(a, bases);
|
||||
int bCount = Utils.countOccurrences(b, bases);
|
||||
|
||||
refCount += a == ref.getBase() ? aCount : bCount;
|
||||
altCount += a == ref.getBase() ? bCount : aCount;
|
||||
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
|
||||
refCount += genotype.getNegLog10PError() * (a == ref.getBase() ? aCount : bCount);
|
||||
altCount += genotype.getNegLog10PError() * (a == ref.getBase() ? bCount : aCount);
|
||||
}
|
||||
|
||||
// sanity check
|
||||
|
|
|
|||
|
|
@ -50,11 +50,13 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
* @param logger logger
|
||||
* @param UAC unified arg collection
|
||||
* @param outputFormat output format
|
||||
* @param verboseWriter verbose writer
|
||||
*/
|
||||
protected void initialize(Set<String> samples,
|
||||
Logger logger,
|
||||
UnifiedArgumentCollection UAC,
|
||||
GenotypeWriterFactory.GENOTYPE_FORMAT outputFormat) {
|
||||
GenotypeWriterFactory.GENOTYPE_FORMAT outputFormat,
|
||||
PrintWriter verboseWriter) {
|
||||
this.samples = new TreeSet<String>(samples);
|
||||
this.logger = logger;
|
||||
baseModel = UAC.baseModel;
|
||||
|
|
@ -66,24 +68,14 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
POOL_SIZE = UAC.POOLSIZE;
|
||||
CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
|
||||
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
|
||||
if ( UAC.VERBOSE != null ) {
|
||||
try {
|
||||
verboseWriter = new PrintWriter(UAC.VERBOSE);
|
||||
initializeVerboseWriter(verboseWriter);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Could not open file " + UAC.VERBOSE + " for writing");
|
||||
}
|
||||
}
|
||||
REPORT_SLOD = ! UAC.NO_SLOD;
|
||||
this.verboseWriter = verboseWriter;
|
||||
if ( verboseWriter != null )
|
||||
initializeVerboseWriter(verboseWriter);
|
||||
}
|
||||
|
||||
protected void initializeVerboseWriter(PrintWriter writer) { };
|
||||
|
||||
public void close() {
|
||||
if ( verboseWriter != null )
|
||||
verboseWriter.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param tracker rod data
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
|||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.Set;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
|
||||
public class GenotypeCalculationModelFactory {
|
||||
|
|
@ -55,7 +56,8 @@ public class GenotypeCalculationModelFactory {
|
|||
public static GenotypeCalculationModel makeGenotypeCalculation(Set<String> samples,
|
||||
Logger logger,
|
||||
UnifiedArgumentCollection UAC,
|
||||
GenotypeWriterFactory.GENOTYPE_FORMAT outputFormat) {
|
||||
GenotypeWriterFactory.GENOTYPE_FORMAT outputFormat,
|
||||
PrintWriter verboseWriter) {
|
||||
GenotypeCalculationModel gcm;
|
||||
switch ( UAC.genotypeModel ) {
|
||||
case EM_POINT_ESTIMATE:
|
||||
|
|
@ -70,7 +72,7 @@ public class GenotypeCalculationModelFactory {
|
|||
default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
|
||||
}
|
||||
|
||||
gcm.initialize(samples, logger, UAC, outputFormat);
|
||||
gcm.initialize(samples, logger, UAC, outputFormat, verboseWriter);
|
||||
return gcm;
|
||||
}
|
||||
}
|
||||
|
|
@ -50,9 +50,6 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
|
||||
public boolean ALL_BASES = false;
|
||||
|
||||
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||
public String VERBOSE = null;
|
||||
|
||||
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
|
||||
public boolean NO_SLOD = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -40,6 +40,8 @@ import org.broadinstitute.sting.utils.genotype.glf.GLFGenotypeWriter;
|
|||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -56,11 +58,18 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
|||
@Argument(doc = "File to which variants should be written", required = false)
|
||||
public GenotypeWriter writer = null;
|
||||
|
||||
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||
public String VERBOSE = null;
|
||||
|
||||
|
||||
// the verbose writer
|
||||
private PrintWriter verboseWriter = null;
|
||||
|
||||
// the model used for calculating genotypes
|
||||
private ThreadLocal<GenotypeCalculationModel> gcm = new ThreadLocal<GenotypeCalculationModel>();
|
||||
|
||||
// samples in input
|
||||
private Set<String> samples;
|
||||
private Set<String> samples = new HashSet<String>();
|
||||
|
||||
|
||||
/** Enable deletions in the pileup **/
|
||||
|
|
@ -75,7 +84,6 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
|||
*
|
||||
**/
|
||||
public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) {
|
||||
//gcm.close();
|
||||
this.UAC = UAC;
|
||||
initialize();
|
||||
}
|
||||
|
|
@ -100,17 +108,36 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
|||
throw new IllegalArgumentException(sb.toString());
|
||||
}
|
||||
|
||||
// get all of the unique sample names
|
||||
// if we're supposed to assume a single sample
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
// some arguments can't be handled (for now) while we are multi-threaded
|
||||
if ( getToolkit().getArguments().numberOfThreads > 1 ) {
|
||||
// no ASSUME_SINGLE_SAMPLE because the IO system doesn't know how to get the sample name
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads");
|
||||
// no VERBOSE because we'd need to deal with parallelizing the writing
|
||||
if ( VERBOSE != null )
|
||||
throw new IllegalArgumentException("For technical reasons, the VERBOSE argument cannot be used with multiple threads");
|
||||
}
|
||||
|
||||
// print them out for debugging (need separate loop to ensure uniqueness)
|
||||
// for ( String sample : samples )
|
||||
// logger.debug("SAMPLE: " + sample);
|
||||
// get all of the unique sample names - unless we're in POOLED mode, in which case we ignore the sample names
|
||||
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
|
||||
// if we're supposed to assume a single sample, do so
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
// for ( String sample : samples )
|
||||
// logger.debug("SAMPLE: " + sample);
|
||||
}
|
||||
|
||||
// initialize the verbose writer
|
||||
if ( VERBOSE != null ) {
|
||||
try {
|
||||
verboseWriter = new PrintWriter(VERBOSE);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Could not open file " + VERBOSE + " for writing");
|
||||
}
|
||||
}
|
||||
// *** If we were called by another walker, then we don't ***
|
||||
// *** want to do any of the other initialization steps. ***
|
||||
if ( writer == null )
|
||||
|
|
@ -118,16 +145,8 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
|||
|
||||
// *** If we got here, then we were instantiated by the GATK engine ***
|
||||
|
||||
// if we're in POOLED mode, we no longer want the sample names
|
||||
// (although they're still stored in the gcm if needed)
|
||||
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED )
|
||||
samples.clear();
|
||||
|
||||
// get the optional header fields
|
||||
Set<VCFHeaderLine> headerInfo = getHeaderInfo();
|
||||
|
||||
// initialize the header
|
||||
GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), samples, headerInfo);
|
||||
GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), samples, getHeaderInfo());
|
||||
}
|
||||
|
||||
private Set<VCFHeaderLine> getHeaderInfo() {
|
||||
|
|
@ -177,20 +196,22 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
|||
* @param rawContext contextual information around the locus
|
||||
*/
|
||||
public Pair<VariationCall, List<Genotype>> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
GenotypeWriterFactory.GENOTYPE_FORMAT format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
if(writer != null) {
|
||||
if(writer instanceof VCFGenotypeWriter)
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
else if(writer instanceof GLFGenotypeWriter)
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GLF;
|
||||
else if(writer instanceof GeliGenotypeWriter)
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
|
||||
else
|
||||
throw new StingException("Unsupported genotype format: " + writer.getClass().getName());
|
||||
|
||||
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
|
||||
if ( gcm.get() == null ) {
|
||||
GenotypeWriterFactory.GENOTYPE_FORMAT format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
if ( writer != null ) {
|
||||
if ( writer instanceof VCFGenotypeWriter )
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
else if ( writer instanceof GLFGenotypeWriter )
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GLF;
|
||||
else if ( writer instanceof GeliGenotypeWriter )
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
|
||||
else
|
||||
throw new StingException("Unsupported genotype format: " + writer.getClass().getName());
|
||||
}
|
||||
gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC, format, verboseWriter));
|
||||
}
|
||||
|
||||
if(gcm.get() == null)
|
||||
gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC, format));
|
||||
|
||||
char ref = Character.toUpperCase(refContext.getBase());
|
||||
if ( !BaseUtils.isRegularBase(ref) )
|
||||
|
|
@ -281,23 +302,9 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
|||
|
||||
// Close any file writers
|
||||
public void onTraversalDone(Integer sum) {
|
||||
//gcm.close();
|
||||
if ( verboseWriter != null )
|
||||
verboseWriter.close();
|
||||
|
||||
logger.info("Processed " + sum + " loci that are callable for SNPs");
|
||||
}
|
||||
|
||||
/**
|
||||
* A class to keep track of some basic metrics about our calls
|
||||
*/
|
||||
protected class CallMetrics {
|
||||
long nConfidentCalls = 0;
|
||||
long nNonConfidentCalls = 0;
|
||||
long nCalledBases = 0;
|
||||
|
||||
CallMetrics() {}
|
||||
|
||||
public String toString() {
|
||||
return String.format("UG: %d confident and %d non-confident calls were made at %d bases",
|
||||
nConfidentCalls, nNonConfidentCalls, nCalledBases);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1PointEM() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("44942d4e56aec62520903d297cee5fd8"));
|
||||
Arrays.asList("b2e7854a474fbd24add057673ba469f1"));
|
||||
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2PointEM() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("aabea51b271df324eb543bc3417c0a42"));
|
||||
Arrays.asList("c1ea2a36a7dacb278d82be9963dc6053"));
|
||||
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testPooled1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
|
||||
Arrays.asList("a733878ce9c257148a224aa00c0e63e9"));
|
||||
Arrays.asList("c2364a23fcca9d08c7424e7a68c3ba04"));
|
||||
executeTest("testPooled1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -56,7 +56,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("b68ad77195f6e926c4fce5f1a91f3bc9"));
|
||||
Arrays.asList("293ea49e8c6854aa04099a48293a2c65"));
|
||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("b63c43ba827cfc74bb8ef679ce3d847d"));
|
||||
Arrays.asList("44362bd78c9559c194149fa0be20004b"));
|
||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -72,7 +72,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("8ae0be3b65c83d973519abad858c0073"));
|
||||
Arrays.asList("9013c944a894d1d941167c2d1d57f0f9"));
|
||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -85,11 +85,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testParameter() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "-genotype", "c984952b91193d9550066f9514434183" );
|
||||
e.put( "-all_bases", "0062942f2323ea0f2c9d3a3739a28d20" );
|
||||
e.put( "--min_base_quality_score 10", "8a645a76d85e8a6edda6acdf8071ee6b" );
|
||||
e.put( "--min_mapping_quality_score 10", "cf6c38de9872bdbf5da1a22d8c962ebe" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "e21a319eb0efbe11c3c1eeea7c9c0144" );
|
||||
e.put( "-genotype", "9f7886f0f63de56ebcd187c22725b1f7" );
|
||||
e.put( "-all_bases", "17f08368b48ebb5e18063da47f689e4e" );
|
||||
e.put( "--min_base_quality_score 10", "cdac609651438aa8e74cb859aa04e2f0" );
|
||||
e.put( "--min_mapping_quality_score 10", "ee1afd51eda0d005e5b3e908af63eb27" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "b9bec75e1aa4c3f45e0a88ee74b41321" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -103,7 +103,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
|
||||
Arrays.asList("6823ce46547cf9693cda040dcbd4c9c1"));
|
||||
Arrays.asList("022cf14445f7cd6beb04957c8fea9ae5"));
|
||||
executeTest("testConfidence", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue