No longer allow the lod_threshold argument - use confidence instead.

Have UG output qscores in all cases.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1968 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-11-03 16:18:51 +00:00
parent 2fb45dbd73
commit 11d950abe0
6 changed files with 66 additions and 31 deletions

View File

@ -26,28 +26,46 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// run the EM calculation
EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL);
double lod = overall.getPofD() - overall.getPofNull();
logger.debug(String.format("LOD=%f", lod));
// return a null call if we don't pass the lod cutoff
if ( !ALL_BASE_MODE && lod < LOD_THRESHOLD )
return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
double PofD = Math.pow(10, overall.getPofD());
double PofNull = Math.pow(10, overall.getPofNull());
double sum = PofD + PofNull;
// calculate the phred-scaled confidence score
double phredScaledConfidence;
boolean bestIsRef = false;
if ( PofD > PofNull ) {
phredScaledConfidence = -10.0 * Math.log10(PofNull / sum);
} else {
phredScaledConfidence = -10.0 * Math.log10(PofD / sum);
bestIsRef = true;
}
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
}
// generate the calls
GenotypeLocusData locusdata = GenotypeWriterFactory.createSupportedGenotypeLocusData(OUTPUT_FORMAT, ref, context.getLocation());
if ( locusdata != null ) {
if ( locusdata instanceof ConfidenceBacked ) {
((ConfidenceBacked)locusdata).setConfidence(lod);
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
}
if ( locusdata instanceof SLODBacked ) {
// calculate strand score
double lod = overall.getPofD() - overall.getPofNull();
logger.debug(String.format("LOD=%f", lod));
EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD);
EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE);
double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
logger.debug(String.format("SLOD=%f", lod, strandScore));
logger.debug(String.format("SLOD=%f", strandScore));
((SLODBacked)locusdata).setSLOD(strandScore);
}

View File

@ -35,7 +35,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
protected boolean GENOTYPE_MODE;
protected boolean POOLED_INPUT;
protected int POOL_SIZE;
protected double LOD_THRESHOLD;
protected double CONFIDENCE_THRESHOLD;
protected double MINIMUM_ALLELE_FREQUENCY;
protected int maxDeletionsInPileup;
@ -68,7 +67,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
GENOTYPE_MODE = UAC.GENOTYPE;
POOLED_INPUT = UAC.POOLED;
POOL_SIZE = UAC.POOLSIZE;
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
maxDeletionsInPileup = UAC.MAX_DELETIONS;

View File

@ -45,16 +45,27 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// get the genotype likelihoods
Pair<ReadBackedPileup, GenotypeLikelihoods> discoveryGL = getSingleSampleLikelihoods(ref, sampleContext, priors, StratifiedContext.OVERALL);
// calculate the lod threshold
double[] posteriors = discoveryGL.second.getPosteriors();
// find the index of the best genotype
double[] posteriors = discoveryGL.second.getNormalizedPosteriors();
Integer sortedPosteriors[] = Utils.SortPermutation(posteriors);
double bestGenotype = posteriors[sortedPosteriors[sortedPosteriors.length - 1]];
double nextBestGenotype = posteriors[sortedPosteriors[sortedPosteriors.length - 2]];
double refGenotype = posteriors[DiploidGenotype.createHomGenotype(ref).ordinal()];
double lodConfidence = (GENOTYPE_MODE ? (bestGenotype - nextBestGenotype) : (bestGenotype - refGenotype));
int bestIndex = sortedPosteriors[sortedPosteriors.length - 1];
// flag to determine if ref is the best call (not necessary in genotype mode)
boolean bestIsRef = false;
// calculate the phred-scaled confidence score
double phredScaledConfidence;
if ( GENOTYPE_MODE ) {
phredScaledConfidence = -10.0 * Math.log10(1.0 - posteriors[bestIndex]);
} else {
int refIndex = DiploidGenotype.createHomGenotype(ref).ordinal();
bestIsRef = (refIndex == bestIndex);
double pError = (bestIsRef ? 1.0 - posteriors[refIndex] : posteriors[refIndex]);
phredScaledConfidence = -10.0 * Math.log10(pError);
}
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
if ( !ALL_BASE_MODE && lodConfidence < LOD_THRESHOLD ) {
if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
}
@ -77,7 +88,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
GenotypeLocusData locusdata = GenotypeWriterFactory.createSupportedGenotypeLocusData(OUTPUT_FORMAT, ref, context.getLocation());
if ( locusdata != null ) {
if ( locusdata instanceof ConfidenceBacked ) {
((ConfidenceBacked)locusdata).setConfidence(lodConfidence);
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
}
}

View File

@ -71,11 +71,11 @@ public class UnifiedArgumentCollection {
// control the various parameters to be used
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false)
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "[DEPRECATED] The lod threshold by which variants should be filtered", required = false)
public double LOD_THRESHOLD = Double.MIN_VALUE;
@Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false)
public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE;
@Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered", required = false)
public double CONFIDENCE_THRESHOLD = 0.0;
@Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
public Integer MAX_DELETIONS = 1;

View File

@ -100,9 +100,17 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<Genotype>, GenotypeL
**/
public void initialize() {
// deal with input errors
if ( UAC.POOLED && UAC.genotypeModel == GenotypeCalculationModel.Model.EM_POINT_ESTIMATE ) {
throw new StingException("This was an attempt to use an EM Point Estimate model with pooled genotype calculations. This model does not work with pooled data.");
}
if ( UAC.LOD_THRESHOLD > Double.MIN_VALUE ) {
StringBuilder sb = new StringBuilder();
sb.append("\n***\tThe --lod_threshold argument is no longer supported; instead, please use --min_confidence_threshold.");
sb.append("\n***\tThere is approximately a 10-to-1 mapping from confidence to LOD.");
sb.append("\n***\tUse Q" + (10.0 * UAC.LOD_THRESHOLD) + " as an approximate equivalent to your LOD " + UAC.LOD_THRESHOLD + " cutoff");
throw new StingException(sb.toString());
}
// get all of the unique sample names
samples = new HashSet<String>();

View File

@ -13,7 +13,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
}
public static String testGeliLod5() {
return baseTestString() + " --variant_output_format GELI -lod 5";
return baseTestString() + " --variant_output_format GELI -confidence 50";
}
private static String OneMb1StateMD5 = "7e3fc1d8427329eb2a3e05a81011749a";
@ -42,16 +42,16 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
Arrays.asList("dba7f51b0ddb1afebcd8f229444f70e5"));
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1,
Arrays.asList("9985a92060b512b5d27b4074faf8b60b"));
executeTest("testMultiSamplePilot1", spec);
}
@Test
public void testMultiSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
Arrays.asList("c2a7dd9bec6819b2d7702564955c993b"));
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1,
Arrays.asList("394f009c9bad34eb584fa10d133d79e0"));
executeTest("testMultiSamplePilot2", spec);
}
@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void genotypeTest() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-10,100,000 -bm empirical --genotype", 1,
Arrays.asList("6a930a8f115495f5319541c21a892d8e"));
Arrays.asList("45da29e3b1306d546a7b80c30c979ad4"));
executeTest("genotypeTest", spec);
}
@ -167,12 +167,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testLOD() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 10.0, "94c5b48c0c956fcdacbffaa38a80d926" );
e.put( 3.0, "f1f53b944b821f05d5dd1ad8fda2f57b" );
e.put( 100.0, "94c5b48c0c956fcdacbffaa38a80d926" );
e.put( 30.0, "df455abc1b2bc533aa1dc6eb088a835a" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseTestString() + " --variant_output_format GELI -L 1:10,000,000-11,000,000 -bm EMPIRICAL -lod " + entry.getKey(), 1,
baseTestString() + " --variant_output_format GELI -L 1:10,000,000-11,000,000 -bm EMPIRICAL -confidence " + entry.getKey(), 1,
Arrays.asList(entry.getValue()));
executeTest("testLOD", spec);
}
@ -186,7 +186,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 0.01, "953fe510a81e357d0b56feac2a90095d" );
e.put( 0.01, "b8837be7e8beb3ab2ed7150cdc022c65" );
e.put( 0.0001, "ef0f2af7d13f166829d86b15fabc2b81" );
e.put( 1.0 / 1850, "a435c8c966c11f4393a25a9d01c4fc3d" );
@ -204,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void empirical1MbTestBinaryGeli() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseTestString() + " -L 1:10,000,000-11,000,000 -bm empirical --variant_output_format GELI_BINARY -lod 5", 1,
baseTestString() + " -L 1:10,000,000-11,000,000 -bm empirical --variant_output_format GELI_BINARY -confidence 50", 1,
Arrays.asList("915abf82a04fcd1842f6865501bae67c"));
executeTest("empirical1MbTestBinaryGeli", spec);
}