And now the UnifiedGenotyper can officially annotate genotype (FORMAT) fields too.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3039 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e757f6f078
commit
03480c955c
|
|
@@ -220,7 +220,7 @@ public class VariantContextAdaptors {
|
|||
return toVCF(vc, vcfRefBase, null, true);
|
||||
}
|
||||
|
||||
public static VCFRecord toVCF(VariantContext vc, char vcfRefBase, List<String> vcfGenotypeAttributeKeys, boolean filtersWereAppliedToContext) {
|
||||
public static VCFRecord toVCF(VariantContext vc, char vcfRefBase, List<String> allowedGenotypeAttributeKeys, boolean filtersWereAppliedToContext) {
|
||||
// deal with the reference
|
||||
String referenceBases = new String(vc.getReference().getBases());
|
||||
|
||||
|
|
@@ -277,11 +277,12 @@ public class VariantContextAdaptors {
|
|||
alleleMap.put(a, encoding);
|
||||
}
|
||||
|
||||
if ( vcfGenotypeAttributeKeys == null ) {
|
||||
vcfGenotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY);
|
||||
vcfGenotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
|
||||
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY);
|
||||
for ( String key : calcVCFGenotypeKeys(vc) ) {
|
||||
if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) )
|
||||
vcfGenotypeAttributeKeys.add(key);
|
||||
}
|
||||
}
|
||||
String genotypeFormatString = Utils.join(VCFRecord.GENOTYPE_FIELD_SEPERATOR, vcfGenotypeAttributeKeys);
|
||||
|
|
|
|||
|
|
@@ -49,5 +49,5 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, ExperimentalA
|
|||
|
||||
public String getKeyName() { return "AD"; }
|
||||
|
||||
public VCFFormatHeaderLine getDescription() { return new VCFFormatHeaderLine(getKeyName(), 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Depth in genotypes for each ALT allele, in the same order as listed"); }
|
||||
public VCFFormatHeaderLine getDescription() { return new VCFFormatHeaderLine(getKeyName(), 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Depth in genotypes for each ALT allele, in the same order as listed"); }
|
||||
}
|
||||
|
|
@@ -119,6 +119,7 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
|
|||
CalledGenotype cg = new CalledGenotype(sample, myAlleles, AFbasedGenotype.second);
|
||||
cg.setLikelihoods(GLs.get(sample).getLikelihoods());
|
||||
cg.setReadBackedPileup(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup());
|
||||
cg.putAttribute(VCFGenotypeRecord.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
|
||||
|
||||
double[] posteriors = GLs.get(sample).getPosteriors();
|
||||
cg.setPosteriors(posteriors);
|
||||
|
|
|
|||
|
|
@@ -12,15 +12,15 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
*/
|
||||
public class VCFFormatHeaderLine extends VCFHeaderLine {
|
||||
|
||||
// the info field types
|
||||
public enum INFO_TYPE {
|
||||
// the format field types
|
||||
public enum FORMAT_TYPE {
|
||||
Integer, Float, String
|
||||
}
|
||||
|
||||
private String mName;
|
||||
private int mCount;
|
||||
private String mDescription;
|
||||
private INFO_TYPE mType;
|
||||
private FORMAT_TYPE mType;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@@ -31,7 +31,7 @@ public class VCFFormatHeaderLine extends VCFHeaderLine {
|
|||
* @param type the type for this header line
|
||||
* @param description the description for this header line
|
||||
*/
|
||||
public VCFFormatHeaderLine(String name, int count, INFO_TYPE type, String description) {
|
||||
public VCFFormatHeaderLine(String name, int count, FORMAT_TYPE type, String description) {
|
||||
super("FORMAT", "");
|
||||
mName = name;
|
||||
mCount = count;
|
||||
|
|
@@ -52,7 +52,7 @@ public class VCFFormatHeaderLine extends VCFHeaderLine {
|
|||
|
||||
mName = pieces[0];
|
||||
mCount = Integer.valueOf(pieces[1]);
|
||||
mType = INFO_TYPE.valueOf(pieces[2]);
|
||||
mType = FORMAT_TYPE.valueOf(pieces[2]);
|
||||
mDescription = Utils.trim(pieces[3], '"');
|
||||
// just in case there were some commas in the description
|
||||
for (int i = 4; i < pieces.length; i++)
|
||||
|
|
@@ -63,6 +63,11 @@ public class VCFFormatHeaderLine extends VCFHeaderLine {
|
|||
return String.format("FORMAT=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
|
||||
}
|
||||
|
||||
public String getName() { return mName; }
|
||||
public int getCount() { return mCount; }
|
||||
public String getDescription() { return mDescription; }
|
||||
public FORMAT_TYPE getType() { return mType; }
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFFormatHeaderLine) )
|
||||
return false;
|
||||
|
|
|
|||
|
|
@@ -333,10 +333,10 @@ public class VCFGenotypeRecord {
|
|||
|
||||
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings() {
|
||||
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.String, "Genotype"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Float, "Genotype Quality"));
|
||||
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_POSTERIORS_TRIPLET_KEY, 3, VCFFormatHeaderLine.INFO_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.String, "Genotype"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Genotype Quality"));
|
||||
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_POSTERIORS_TRIPLET_KEY, 3, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
//result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality"));
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -30,11 +30,8 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
// validation stringency
|
||||
private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT;
|
||||
|
||||
// standard genotype format strings
|
||||
private static String[] standardGenotypeFormatStrings = { VCFGenotypeRecord.GENOTYPE_KEY,
|
||||
VCFGenotypeRecord.DEPTH_KEY,
|
||||
VCFGenotypeRecord.GENOTYPE_QUALITY_KEY,
|
||||
VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY };
|
||||
// allowed genotype format strings
|
||||
private List<String> allowedGenotypeFormatStrings;
|
||||
|
||||
public VCFGenotypeWriterAdapter(File writeTo) {
|
||||
if (writeTo == null) throw new RuntimeException("VCF output file must not be null");
|
||||
|
|
@@ -55,14 +52,21 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
public void writeHeader(Set<String> sampleNames, Set<VCFHeaderLine> headerInfo) {
|
||||
mSampleNames.addAll(sampleNames);
|
||||
|
||||
// setup the header fields
|
||||
// set up the header fields
|
||||
Set<VCFHeaderLine> hInfo = new TreeSet<VCFHeaderLine>();
|
||||
hInfo.add(new VCFHeaderLine(VCFHeader.FILE_FORMAT_KEY, VCFHeader.VCF_VERSION));
|
||||
hInfo.addAll(headerInfo);
|
||||
|
||||
// setup the sample names
|
||||
// set up the sample names
|
||||
mHeader = new VCFHeader(hInfo, mSampleNames);
|
||||
mWriter.writeHeader(mHeader);
|
||||
|
||||
// set up the allowed genotype format fields
|
||||
allowedGenotypeFormatStrings = new ArrayList<String>();
|
||||
for ( VCFHeaderLine field : headerInfo ) {
|
||||
if ( field instanceof VCFFormatHeaderLine )
|
||||
allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getName());
|
||||
}
|
||||
}
|
||||
|
||||
/** finish writing, closing any open files. */
|
||||
|
|
@@ -79,12 +83,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
if ( mHeader == null )
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added");
|
||||
|
||||
List<String> formatStrings;
|
||||
if ( vc.getChromosomeCount() > 0 )
|
||||
formatStrings = Arrays.asList(standardGenotypeFormatStrings);
|
||||
else
|
||||
formatStrings = new ArrayList<String>();
|
||||
VCFRecord call = VariantContextAdaptors.toVCF(vc, vc.getReference().toString().charAt(0), formatStrings, false);
|
||||
VCFRecord call = VariantContextAdaptors.toVCF(vc, vc.getReference().toString().charAt(0), allowedGenotypeFormatStrings, false);
|
||||
|
||||
Set<Allele> altAlleles = vc.getAlternateAlleles();
|
||||
StringBuffer altAlleleCountString = new StringBuffer();
|
||||
|
|
|
|||
|
|
@@ -35,7 +35,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("d8af2cb687aa89d21c5492c98f100b5f"));
|
||||
Arrays.asList("2a31f997f9c07b6259cf3cdf2459c74b"));
|
||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("724bc2b640e111df82b9ebd261ddb5d9"));
|
||||
Arrays.asList("2ab8e48cd7a3d0474e187c2af9694628"));
|
||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("304ec09a459705f5738a9a82b603ae1f"));
|
||||
Arrays.asList("ed35f375beb0ac849bfe98daffc1cee2"));
|
||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testParallelization() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1,
|
||||
Arrays.asList("33e9fe3b8c1ed729c22196d5db3e0d11"));
|
||||
Arrays.asList("75488092bb229d6345b84d9b793e4e55"));
|
||||
executeTest("test parallelization", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -77,11 +77,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testParameter() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "-genotype", "fb3ffa0f101cf9f8ffc6892b0acab414" );
|
||||
e.put( "-all_bases", "3888d0856370f9a5b18c078e2caaec2a" );
|
||||
e.put( "--min_base_quality_score 26", "66f729d1948dc057486832731278c226" );
|
||||
e.put( "--min_mapping_quality_score 26", "80a7fca199b899a3d0bc1293eb7bf7e5" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "c6f8846865dcd9021372df917f6c962b" );
|
||||
e.put( "-genotype", "02316b8892d439e23d6fbbc65232f921" );
|
||||
e.put( "-all_bases", "bcce54904e4c3352168bbfb39f2b9a2f" );
|
||||
e.put( "--min_base_quality_score 26", "a8a286e61af8a6b9ba72c2eab19573bb" );
|
||||
e.put( "--min_mapping_quality_score 26", "c599b0c3a1e42772e9d1f9ff7112d1e4" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "46129c47e7d283d8950e029ed39dc1e8" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@@ -95,7 +95,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
|
||||
Arrays.asList("7854c02fcc0c8fcc879f6e35fef2e11f"));
|
||||
Arrays.asList("64c8fffc735f72663f27b2257fff5583"));
|
||||
executeTest("testConfidence", spec);
|
||||
}
|
||||
|
||||
|
|
@@ -106,7 +106,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testOtherOutput() {
|
||||
String[] md5s = {"5f3b9abe1b2c30c2ede0007c43e1934c", "8cba0b8752f18fc620b4697840bc7291"};
|
||||
String[] md5s = {"e9d7c27ed63d2e92a17ba44501ab1138", "8cba0b8752f18fc620b4697840bc7291"};
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper" +
|
||||
" -R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
|
|
|
|||
Loading…
Reference in New Issue