3 changes to UG which break integration tests:

1. emit AA,AB,BB likelihoods in the FORMAT field for Mark
2. remove constraint that genotype alleles (in the GT field) need to be lexigraphically sorted.
3. Add bam file(s) used by genotyper to header for Kiran


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2963 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-09 17:16:47 +00:00
parent aa7191353a
commit 5a20bf0e64
7 changed files with 41 additions and 31 deletions

View File

@ -162,8 +162,7 @@ public class VariantContextAdaptors {
alleles.add(refAllele);
for ( String alt : vcf.getAlternateAlleleList() ) {
if ( ! Allele.acceptableAlleleBases(alt) ) {
// todo -- cleanup
System.out.printf("Excluding vcf record %s%n", vcf);
//System.out.printf("Excluding vcf record %s%n", vcf);
return null;
}
alleles.add(new Allele(alt, false));
@ -288,15 +287,7 @@ public class VariantContextAdaptors {
for ( Genotype g : vc.getGenotypesSortedByName() ) {
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
// TODO -- REMOVE ME ONCE INTEGRATION TESTS PASS!!!
ArrayList<Allele> temporaryList = new ArrayList<Allele>(g.getAlleles());
Collections.sort(temporaryList, new Comparator<Allele>() {
public int compare(Allele a1, Allele a2) {
return a1.toString().charAt(0) - a2.toString().charAt(0);
}
});
for ( Allele a : temporaryList ) {
//for ( Allele a : g.getAlleles() ) {
for ( Allele a : g.getAlleles() ) {
encodings.add(alleleMap.get(a.toString()));
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
@ -93,6 +94,9 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt);
Allele refAllele = new Allele(Character.toString(ref), true);
Allele altAllele = new Allele(Character.toString(alt), false);
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt);
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt);
for ( String sample : GLs.keySet() ) {
@ -113,17 +117,21 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
CalledGenotype cg = new CalledGenotype(sample, myAlleles, AFbasedGenotype.second);
cg.setLikelihoods(GLs.get(sample).getLikelihoods());
cg.setPosteriors(GLs.get(sample).getPosteriors());
cg.setReadBackedPileup(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup());
double[] posteriors = GLs.get(sample).getPosteriors();
cg.setPosteriors(posteriors);
String GL = String.format("%.2f,%.2f,%.2f",
posteriors[refGenotype.ordinal()],
posteriors[hetGenotype.ordinal()],
posteriors[homGenotype.ordinal()]);
cg.putAttribute(VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY, GL);
calls.put(sample, cg);
}
// output to beagle file if requested
if ( beagleWriter != null ) {
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt);
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt);
for ( String sample : samples ) {
GenotypeLikelihoods gl = GLs.get(sample);
if ( gl == null ) {

View File

@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.util.*;
import java.io.PrintStream;
import java.io.File;
/**
@ -139,7 +140,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
if ( !UAC.NO_SLOD )
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.STRAND_BIAS_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Strand Bias"));
// FORMAT fields if not in POOLED mode
// FORMAT and INFO fields if not in POOLED mode
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
headerInfo.addAll(VCFGenotypeRecord.getSupportedHeaderStrings());
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_COUNT_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
@ -153,6 +154,9 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
Map<String,String> commandLineArgs = CommandLineUtils.getApproximateCommandLineArguments(args);
for ( Map.Entry<String, String> commandLineArg : commandLineArgs.entrySet() )
headerInfo.add(new VCFHeaderLine(String.format("UG_%s", commandLineArg.getKey()), commandLineArg.getValue()));
// also, the list of input bams
for ( File file : getToolkit().getArguments().samFiles )
headerInfo.add(new VCFHeaderLine("UG_bam_file_used", file.getName()));
return headerInfo;
}

View File

@ -6,16 +6,18 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
/**
* This class emcompasses all the basic information about a genotype. It is immutable.
* This class emcompasses all the basic information about a called genotype.
*
* @author ebanks
*/
public class CalledGenotype extends MutableGenotype {
// key names for standard genotype attributes
public static final String LIKELIHOODS_ATTRIBUTE_KEY = "Likelihoods";
public static final String POSTERIORS_ATTRIBUTE_KEY = "Posteriors";
public static final String READBACKEDPILEUP_ATTRIBUTE_KEY = "ReadBackedPileup";
public CalledGenotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased);
}

View File

@ -19,6 +19,7 @@ public class VCFGenotypeRecord {
public static final String DEPTH_KEY = "DP";
public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
public static final String GENOTYPE_FILTER_KEY = "FT";
public static final String GENOTYPE_POSTERIORS_TRIPLET_KEY = "GL";
public static final String OLD_DEPTH_KEY = "RD";
// the values for empty fields
@ -313,6 +314,8 @@ public class VCFGenotypeRecord {
result = String.valueOf(MISSING_DEPTH);
else if ( field.equals(GENOTYPE_FILTER_KEY) )
result = UNFILTERED;
else if ( field.equals(GENOTYPE_POSTERIORS_TRIPLET_KEY) )
result = "0,0,0";
// TODO -- support haplotype quality
//else if ( field.equals(HAPLOTYPE_QUALITY_KEY) )
// result = String.valueOf(MISSING_HAPLOTYPE_QUALITY);
@ -324,6 +327,7 @@ public class VCFGenotypeRecord {
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.String, "Genotype"));
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Float, "Genotype Quality"));
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Read Depth (only filtered reads used for calling)"));
result.add(new VCFFormatHeaderLine(GENOTYPE_POSTERIORS_TRIPLET_KEY, 3, VCFFormatHeaderLine.INFO_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
//result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality"));
return result;
}

View File

@ -11,7 +11,7 @@ import java.util.*;
/**
* @author aaron
* @author aaron, ebanks
* <p/>
* Class VCFGenotypeWriterAdapter
* <p/>
@ -33,7 +33,8 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
// standard genotype format strings
private static String[] standardGenotypeFormatStrings = { VCFGenotypeRecord.GENOTYPE_KEY,
VCFGenotypeRecord.DEPTH_KEY,
VCFGenotypeRecord.GENOTYPE_QUALITY_KEY };
VCFGenotypeRecord.GENOTYPE_QUALITY_KEY,
VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY };
public VCFGenotypeWriterAdapter(File writeTo) {
if (writeTo == null) throw new RuntimeException("VCF output file must not be null");

View File

@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testPooled1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
Arrays.asList("5a86c53e8f0897a71ff74662c5696dc4"));
Arrays.asList("c30af5d192661abd77b05a316f1d8923"));
executeTest("testPooled1", spec);
}
@ -35,7 +35,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("2f928b1261963044ca1781601cae4bf7"));
Arrays.asList("882b2fae1cd1ba65cac3cadacec0ce2b"));
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
}
@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("10fe265d0140243b52f500c3882230f2"));
Arrays.asList("aa0cff414e6623c36465726a987a645d"));
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
}
@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("5c455e1a33e3f82d13898b20ee71ac69"));
Arrays.asList("53df224164083cc7d8ad85f3d16ba38f"));
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
}
@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testParallelization() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1,
Arrays.asList("c827c74e59263b0f6b526089b050c100"));
Arrays.asList("3ade750c0d261594ea549db7b127a1e3"));
executeTest("test parallelization", spec);
}
@ -77,11 +77,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-genotype", "8818584bf7053ecf52844f3b404ab630" );
e.put( "-all_bases", "bf6ea1c04bf52002e3365c1f47103d4a" );
e.put( "--min_base_quality_score 26", "3f192cd301f057698d0bc6c41841ce81" );
e.put( "--min_mapping_quality_score 26", "2631025243ac6b52df852309235ec8d3" );
e.put( "--max_mismatches_in_40bp_window 5", "e540a2057164e3b05a5d635805f1167e" );
e.put( "-genotype", "bee9fa71d70fdde094ab30785d4fa84e" );
e.put( "-all_bases", "410cff9d97cd017becd1f6260c7abeeb" );
e.put( "--min_base_quality_score 26", "85e1c35d3926afc68761aefea3f41332" );
e.put( "--min_mapping_quality_score 26", "1c49a7d5e6ad295c0450b8a35053050f" );
e.put( "--max_mismatches_in_40bp_window 5", "7e7db5a0d859704e12a4b89d35065682" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -95,7 +95,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
Arrays.asList("bfa93ee89aa0807b9c4e4793363452b4"));
Arrays.asList("c67dd3e97cb188b117074d2c4692fcfa"));
executeTest("testConfidence", spec);
}
@ -106,7 +106,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testOtherOutput() {
String[] md5s = {"8c7dd53a402b727753002ebcd76168ac", "8cba0b8752f18fc620b4697840bc7291"};
String[] md5s = {"ce0024816a092af9f998a7561ffb4fb2", "8cba0b8752f18fc620b4697840bc7291"};
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper" +
" -R " + oneKGLocation + "reference/human_b36_both.fasta" +