2 separate changes which both affect lots of UG integration md5s, so I'm committing them together:

1. allele balance annotation is now weighted by genotype quality (so we don't get misled by borderline het calls)

2. Updates to the Unified Genotyper for parallelization:
   a. verbose writing now works again; arg was moved from UAC to UG
   b. UG checks for command that don't work with parallelization
   c. some cleanup



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2432 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-23 19:03:56 +00:00
parent 872a9d1c7b
commit dc96879861
6 changed files with 82 additions and 83 deletions

View File

@ -48,8 +48,9 @@ public class AlleleBalance extends StandardVariantAnnotation {
int aCount = Utils.countOccurrences(a, bases);
int bCount = Utils.countOccurrences(b, bases);
refCount += a == ref.getBase() ? aCount : bCount;
altCount += a == ref.getBase() ? bCount : aCount;
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
refCount += genotype.getNegLog10PError() * (a == ref.getBase() ? aCount : bCount);
altCount += genotype.getNegLog10PError() * (a == ref.getBase() ? bCount : aCount);
}
// sanity check

View File

@ -50,11 +50,13 @@ public abstract class GenotypeCalculationModel implements Cloneable {
* @param logger logger
* @param UAC unified arg collection
* @param outputFormat output format
* @param verboseWriter verbose writer
*/
protected void initialize(Set<String> samples,
Logger logger,
UnifiedArgumentCollection UAC,
GenotypeWriterFactory.GENOTYPE_FORMAT outputFormat) {
GenotypeWriterFactory.GENOTYPE_FORMAT outputFormat,
PrintWriter verboseWriter) {
this.samples = new TreeSet<String>(samples);
this.logger = logger;
baseModel = UAC.baseModel;
@ -66,24 +68,14 @@ public abstract class GenotypeCalculationModel implements Cloneable {
POOL_SIZE = UAC.POOLSIZE;
CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
if ( UAC.VERBOSE != null ) {
try {
verboseWriter = new PrintWriter(UAC.VERBOSE);
initializeVerboseWriter(verboseWriter);
} catch (FileNotFoundException e) {
throw new StingException("Could not open file " + UAC.VERBOSE + " for writing");
}
}
REPORT_SLOD = ! UAC.NO_SLOD;
this.verboseWriter = verboseWriter;
if ( verboseWriter != null )
initializeVerboseWriter(verboseWriter);
}
protected void initializeVerboseWriter(PrintWriter writer) { };
public void close() {
if ( verboseWriter != null )
verboseWriter.close();
}
/**
* Must be overridden by concrete subclasses
* @param tracker rod data

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.apache.log4j.Logger;
import java.util.Set;
import java.io.PrintWriter;
public class GenotypeCalculationModelFactory {
@ -55,7 +56,8 @@ public class GenotypeCalculationModelFactory {
public static GenotypeCalculationModel makeGenotypeCalculation(Set<String> samples,
Logger logger,
UnifiedArgumentCollection UAC,
GenotypeWriterFactory.GENOTYPE_FORMAT outputFormat) {
GenotypeWriterFactory.GENOTYPE_FORMAT outputFormat,
PrintWriter verboseWriter) {
GenotypeCalculationModel gcm;
switch ( UAC.genotypeModel ) {
case EM_POINT_ESTIMATE:
@ -70,7 +72,7 @@ public class GenotypeCalculationModelFactory {
default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
}
gcm.initialize(samples, logger, UAC, outputFormat);
gcm.initialize(samples, logger, UAC, outputFormat, verboseWriter);
return gcm;
}
}

View File

@ -50,9 +50,6 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
public boolean ALL_BASES = false;
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
public String VERBOSE = null;
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
public boolean NO_SLOD = false;

View File

@ -40,6 +40,8 @@ import org.broadinstitute.sting.utils.genotype.glf.GLFGenotypeWriter;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.util.*;
import java.io.PrintWriter;
import java.io.FileNotFoundException;
/**
@ -56,11 +58,18 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
@Argument(doc = "File to which variants should be written", required = false)
public GenotypeWriter writer = null;
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
public String VERBOSE = null;
// the verbose writer
private PrintWriter verboseWriter = null;
// the model used for calculating genotypes
private ThreadLocal<GenotypeCalculationModel> gcm = new ThreadLocal<GenotypeCalculationModel>();
// samples in input
private Set<String> samples;
private Set<String> samples = new HashSet<String>();
/** Enable deletions in the pileup **/
@ -75,7 +84,6 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
*
**/
public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) {
//gcm.close();
this.UAC = UAC;
initialize();
}
@ -100,17 +108,36 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
throw new IllegalArgumentException(sb.toString());
}
// get all of the unique sample names
// if we're supposed to assume a single sample
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// some arguments can't be handled (for now) while we are multi-threaded
if ( getToolkit().getArguments().numberOfThreads > 1 ) {
// no ASSUME_SINGLE_SAMPLE because the IO system doesn't know how to get the sample name
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads");
// no VERBOSE because we'd need to deal with parallelizing the writing
if ( VERBOSE != null )
throw new IllegalArgumentException("For technical reasons, the VERBOSE argument cannot be used with multiple threads");
}
// print them out for debugging (need separate loop to ensure uniqueness)
// for ( String sample : samples )
// logger.debug("SAMPLE: " + sample);
// get all of the unique sample names - unless we're in POOLED mode, in which case we ignore the sample names
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
// if we're supposed to assume a single sample, do so
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// for ( String sample : samples )
// logger.debug("SAMPLE: " + sample);
}
// initialize the verbose writer
if ( VERBOSE != null ) {
try {
verboseWriter = new PrintWriter(VERBOSE);
} catch (FileNotFoundException e) {
throw new StingException("Could not open file " + VERBOSE + " for writing");
}
}
// *** If we were called by another walker, then we don't ***
// *** want to do any of the other initialization steps. ***
if ( writer == null )
@ -118,16 +145,8 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
// *** If we got here, then we were instantiated by the GATK engine ***
// if we're in POOLED mode, we no longer want the sample names
// (although they're still stored in the gcm if needed)
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED )
samples.clear();
// get the optional header fields
Set<VCFHeaderLine> headerInfo = getHeaderInfo();
// initialize the header
GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), samples, headerInfo);
GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), samples, getHeaderInfo());
}
private Set<VCFHeaderLine> getHeaderInfo() {
@ -177,20 +196,22 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
* @param rawContext contextual information around the locus
*/
public Pair<VariationCall, List<Genotype>> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
GenotypeWriterFactory.GENOTYPE_FORMAT format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
if(writer != null) {
if(writer instanceof VCFGenotypeWriter)
format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
else if(writer instanceof GLFGenotypeWriter)
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GLF;
else if(writer instanceof GeliGenotypeWriter)
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
else
throw new StingException("Unsupported genotype format: " + writer.getClass().getName());
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
if ( gcm.get() == null ) {
GenotypeWriterFactory.GENOTYPE_FORMAT format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
if ( writer != null ) {
if ( writer instanceof VCFGenotypeWriter )
format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
else if ( writer instanceof GLFGenotypeWriter )
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GLF;
else if ( writer instanceof GeliGenotypeWriter )
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
else
throw new StingException("Unsupported genotype format: " + writer.getClass().getName());
}
gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC, format, verboseWriter));
}
if(gcm.get() == null)
gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC, format));
char ref = Character.toUpperCase(refContext.getBase());
if ( !BaseUtils.isRegularBase(ref) )
@ -281,23 +302,9 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
// Close any file writers
public void onTraversalDone(Integer sum) {
//gcm.close();
if ( verboseWriter != null )
verboseWriter.close();
logger.info("Processed " + sum + " loci that are callable for SNPs");
}
/**
* A class to keep track of some basic metrics about our calls
*/
protected class CallMetrics {
long nConfidentCalls = 0;
long nNonConfidentCalls = 0;
long nCalledBases = 0;
CallMetrics() {}
public String toString() {
return String.format("UG: %d confident and %d non-confident calls were made at %d bases",
nConfidentCalls, nNonConfidentCalls, nCalledBases);
}
}
}

View File

@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1PointEM() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
Arrays.asList("44942d4e56aec62520903d297cee5fd8"));
Arrays.asList("b2e7854a474fbd24add057673ba469f1"));
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
}
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2PointEM() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
Arrays.asList("aabea51b271df324eb543bc3417c0a42"));
Arrays.asList("c1ea2a36a7dacb278d82be9963dc6053"));
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
}
@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testPooled1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
Arrays.asList("a733878ce9c257148a224aa00c0e63e9"));
Arrays.asList("c2364a23fcca9d08c7424e7a68c3ba04"));
executeTest("testPooled1", spec);
}
@ -56,7 +56,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("b68ad77195f6e926c4fce5f1a91f3bc9"));
Arrays.asList("293ea49e8c6854aa04099a48293a2c65"));
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
}
@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("b63c43ba827cfc74bb8ef679ce3d847d"));
Arrays.asList("44362bd78c9559c194149fa0be20004b"));
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
}
@ -72,7 +72,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("8ae0be3b65c83d973519abad858c0073"));
Arrays.asList("9013c944a894d1d941167c2d1d57f0f9"));
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
}
@ -85,11 +85,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-genotype", "c984952b91193d9550066f9514434183" );
e.put( "-all_bases", "0062942f2323ea0f2c9d3a3739a28d20" );
e.put( "--min_base_quality_score 10", "8a645a76d85e8a6edda6acdf8071ee6b" );
e.put( "--min_mapping_quality_score 10", "cf6c38de9872bdbf5da1a22d8c962ebe" );
e.put( "--max_mismatches_in_40bp_window 5", "e21a319eb0efbe11c3c1eeea7c9c0144" );
e.put( "-genotype", "9f7886f0f63de56ebcd187c22725b1f7" );
e.put( "-all_bases", "17f08368b48ebb5e18063da47f689e4e" );
e.put( "--min_base_quality_score 10", "cdac609651438aa8e74cb859aa04e2f0" );
e.put( "--min_mapping_quality_score 10", "ee1afd51eda0d005e5b3e908af63eb27" );
e.put( "--max_mismatches_in_40bp_window 5", "b9bec75e1aa4c3f45e0a88ee74b41321" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -103,7 +103,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
Arrays.asList("6823ce46547cf9693cda040dcbd4c9c1"));
Arrays.asList("022cf14445f7cd6beb04957c8fea9ae5"));
executeTest("testConfidence", spec);
}