Output changes for VCF and UG:

1. Don't cap q-scores at 99
2. Scale SLOD to allow more resolution in the output
3. UG outputs weighted allele balance (AB) and on-off genotype (OO) info fields for het genotype calls (works for joint estimation model and SSG)


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2011 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-11-10 16:31:31 +00:00
parent 1e7ddd2d9f
commit 6a37090529
9 changed files with 186 additions and 23 deletions

View File

@ -66,6 +66,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
logger.debug(String.format("SLOD=%f", strandScore));
// rescale by a factor of 10
strandScore *= 10.0;
((SLODBacked)locusdata).setSLOD(strandScore);
}

View File

@ -333,6 +333,61 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
if ( locusdata instanceof AlleleFrequencyBacked ) {
((AlleleFrequencyBacked)locusdata).setAlleleFrequency(bestAFguess);
}
if ( locusdata instanceof AlleleBalanceBacked ) {
ArrayList<Double> refBalances = new ArrayList<Double>();
ArrayList<Double> onOffBalances = new ArrayList<Double>();
ArrayList<Double> weights = new ArrayList<Double>();
// accumulate ratios and weights
for ( java.util.Map.Entry<String, GenotypeLikelihoods> entry : GLs.entrySet() ) {
// determine the best genotype
Integer sorted[] = Utils.SortPermutation(entry.getValue().getPosteriors());
DiploidGenotype bestGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]];
// we care only about het calls
if ( bestGenotype.isHetRef(ref) ) {
// make sure the alt base is our target alt
char altBase = bestGenotype.base1 != ref ? bestGenotype.base1 : bestGenotype.base2;
if ( altBase != baseOfMax )
continue;
// get the base counts at this pileup (minus deletions)
ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(entry.getKey()).getContext(StratifiedContext.OVERALL));
int[] counts = pileup.getBasePileupAsCounts();
int refCount = counts[BaseUtils.simpleBaseToBaseIndex(ref)];
int altCount = counts[BaseUtils.simpleBaseToBaseIndex(altBase)];
int totalCount = 0;
for (int i = 0; i < counts.length; i++)
totalCount += counts[i];
// add the entries
weights.add(entry.getValue().getNormalizedPosteriors()[bestGenotype.ordinal()]);
refBalances.add((double)refCount / (double)(refCount + altCount));
onOffBalances.add((double)(refCount + altCount) / (double)totalCount);
}
}
if ( weights.size() > 0 ) {
// normalize the weights
double sum = 0.0;
for (int i = 0; i < weights.size(); i++)
sum += weights.get(i);
for (int i = 0; i < weights.size(); i++)
weights.set(i, weights.get(i) / sum);
// calculate total weighted ratios
double normalizedRefRatio = 0.0;
double normalizedOnOffRatio = 0.0;
for (int i = 0; i < weights.size(); i++) {
normalizedRefRatio += weights.get(i) * refBalances.get(i);
normalizedOnOffRatio += weights.get(i) * onOffBalances.get(i);
}
((AlleleBalanceBacked)locusdata).setRefRatio(normalizedRefRatio);
((AlleleBalanceBacked)locusdata).setOnOffRatio(normalizedOnOffRatio);
}
}
if ( locusdata instanceof SLODBacked ) {
// the overall lod
double overallLog10PofNull = Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
@ -354,7 +409,11 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofNull;
double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofNull;
//logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
// strand score is max bias between forward and reverse strands
double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
// rescale by a factor of 10
strandScore *= 10.0;
//logger.debug(String.format("SLOD=%f", strandScore));
((SLODBacked)locusdata).setSLOD(strandScore);

View File

@ -90,6 +90,27 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
if ( locusdata instanceof ConfidenceBacked ) {
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
}
if ( locusdata instanceof AlleleBalanceBacked ) {
DiploidGenotype bestGenotype = DiploidGenotype.values()[bestIndex];
// we care only about het calls
if ( bestGenotype.isHetRef(ref) ) {
char altBase = bestGenotype.base1 != ref ? bestGenotype.base1 : bestGenotype.base2;
// get the base counts at this pileup (minus deletions)
int[] counts = discoveryGL.first.getBasePileupAsCounts();
int refCount = counts[BaseUtils.simpleBaseToBaseIndex(ref)];
int altCount = counts[BaseUtils.simpleBaseToBaseIndex(altBase)];
int totalCount = 0;
for (int i = 0; i < counts.length; i++)
totalCount += counts[i];
// set the ratios
((AlleleBalanceBacked)locusdata).setRefRatio((double)refCount / (double)(refCount + altCount));
((AlleleBalanceBacked)locusdata).setOnOffRatio((double)(refCount + altCount) / (double)totalCount);
}
}
}
return new Pair<List<Genotype>, GenotypeLocusData>(Arrays.asList(call), locusdata);

View File

@ -80,24 +80,28 @@ public class ReadBackedPileup extends BasicPileup {
return secondaryQualPileupAsString(reads, offsets);
}
public String getBasePileupAsCountsString() {
String bases = basePileupAsString(reads, offsets, includeDeletions);
public int[] getBasePileupAsCounts() {
int[] counts = new int[4];
for (int i = 0; i < reads.size(); i++) {
// skip deletion sites
if ( offsets.get(i) == -1 )
continue;
char base = Character.toUpperCase((char)(reads.get(i).getReadBases()[offsets.get(i)]));
if (BaseUtils.simpleBaseToBaseIndex(base) == -1)
continue;
counts[BaseUtils.simpleBaseToBaseIndex(base)]++;
}
int[] counts = new int[4];
for (int i = 0; i < reads.size(); i++)
{
// skip deletion sites
if ( offsets.get(i) == -1 )
continue;
char base = Character.toUpperCase((char)(reads.get(i).getReadBases()[offsets.get(i)]));
if (BaseUtils.simpleBaseToBaseIndex(base) == -1) { continue; }
counts[BaseUtils.simpleBaseToBaseIndex(base)]++;
}
return String.format("A[%d] C[%d] G[%d] T[%d]",
counts[0],
counts[1],
counts[2],
counts[3]);
return counts;
}
public String getBasePileupAsCountsString() {
int[] counts = getBasePileupAsCounts();
return String.format("A[%d] C[%d] G[%d] T[%d]",
counts[0],
counts[1],
counts[2],
counts[3]);
}
public String getProbDistPileup() {

View File

@ -0,0 +1,35 @@
package org.broadinstitute.sting.utils.genotype;
/**
* @author ebanks
* Interface AlleleBalanceBacked
*
* this interface indicates that the genotype is
* backed up by allele balance information.
*/
public interface AlleleBalanceBacked {
/**
*
* @return returns the ratio of ref/(ref+alt) bases for hets
*/
public double getRefRatio();
/**
*
* @param ratio the ref/(ref+alt) ratio
*/
public void setRefRatio(double ratio);
/**
*
* @return returns the ratio of (ref+alt)/total bases for hets
*/
public double getOnOffRatio();
/**
*
* @param ratio the (ref+alt)/total ratio
*/
public void setOnOffRatio(double ratio);
}

View File

@ -33,7 +33,6 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked,
// the sample name, used to propulate the SampleBacked interface
private String mSampleName;
public VCFGenotypeCall(char ref, GenomeLoc loc) {
mRefBase = ref;
mLocation = loc;

View File

@ -10,7 +10,7 @@ import org.broadinstitute.sting.utils.genotype.*;
* <p/>
* represents the meta data for a genotype object.
*/
public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, AlleleFrequencyBacked {
public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, AlleleFrequencyBacked, AlleleBalanceBacked {
// the discovery lod score
private double mConfidence = 0.0;
@ -27,6 +27,10 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked
// the ref base
private char mRefBase;
// the various allele ratios
private double mRefRatio = 0.0;
private double mOnOffRatio = 0.0;
/**
* create a basic genotype meta data pbject, given the following fields
*
@ -105,4 +109,40 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked
public void setAlleleFrequency(double frequency) {
mAlleleFrequency = frequency;
}
/**
* @return returns true if the ref/(ref+alt) ratio for this genotype has been set
*/
public boolean hasRefRatio() {
return mRefRatio > 0.0;
}
/**
* @return returns the ref/(ref+alt) ratio for this genotype
*/
public double getRefRatio() {
return mRefRatio;
}
public void setRefRatio(double ratio) {
mRefRatio = ratio;
}
/**
* @return returns true if the (ref+alt)/total ratio for this genotype has been set
*/
public boolean hasOnOffRatio() {
return mOnOffRatio > 0.0;
}
/**
* @return returns the (ref+alt)/total ratio for this genotype
*/
public double getOnOffRatio() {
return mOnOffRatio;
}
public void setOnOffRatio(double ratio) {
mOnOffRatio = ratio;
}
}

View File

@ -164,8 +164,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
infoFields.put("DP", String.format("%d", totalReadDepth));
double qual = (locusdata == null) ? 0 : ((VCFGenotypeLocusData)locusdata).getConfidence();
// maintain 0-99 based Q-scores
qual = Math.min(qual, 99);
// min Q-score is zero
qual = Math.max(qual, 0);
VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(),
@ -195,6 +194,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
if ( locusdata != null ) {
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
infoFields.put("AF", String.format("%.2f", locusdata.getAlleleFrequency()));
if ( locusdata.hasRefRatio() )
infoFields.put("AB", String.format("%.2f", locusdata.getRefRatio()));
if ( locusdata.hasOnOffRatio() )
infoFields.put("OO", String.format("%.2f", locusdata.getOnOffRatio()));
}
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
return infoFields;

View File

@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1,
Arrays.asList("4f14792e79adef415efaf97e946460e3"));
Arrays.asList("8f14c1efea00b32984ec52db7a800d1b"));
executeTest("testMultiSamplePilot1", spec);
}
@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1,
Arrays.asList("e3a701fd5f1fe4a84746f17ad4b3b3c3"));
Arrays.asList("89c600d72a815c09412c97f82fa2281e"));
executeTest("testMultiSamplePilot2", spec);
}