From 8405156ae1afd53d2afd9d521b6ccbd92ffe4a28 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 4 Jun 2012 14:28:32 -0400 Subject: [PATCH] Refactored VariantsToTable so that 1) genotype-level fields can be specified (stabilized and supported code) and 2) the --moltenize argument could be supported to produce molten output of the data. Added tests that cover these capabilities. --- .../walkers/variantutils/VariantsToTable.java | 135 ++++++++++++------ .../VariantsToTableIntegrationTest.java | 65 ++++++++- 2 files changed, 152 insertions(+), 48 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index dce8378f6..ebe4db629 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; @@ -111,13 +110,16 @@ public class VariantsToTable extends RodWalker { /** * -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10). - * Note that this tool does not support capturing any GENOTYPE field values. Note this argument - * accepts any number of inputs. So -F CHROM -F POS is allowed. + * Note that to capture GENOTYPE (FORMAT) field values, see the GF argument. This argument accepts any number + * of inputs. So -F CHROM -F POS is allowed. */ - @Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true) + @Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=false) public List fieldsToTake = new ArrayList(); - @Hidden + /** + * -GF NAME can be any binding in the FORMAT field (e.g., GQ, PL). + * Note this argument accepts any number of inputs. So -F GQ -F PL is allowed. + */ @Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false) public List genotypeFieldsToTake = new ArrayList(); @@ -130,12 +132,11 @@ public class VariantsToTable extends RodWalker { public boolean showFiltered = false; /** - * If provided, then this tool will exit with success after this number of records have been emitted to the file. + * If provided, then this tool will exit with success after this number of VCF records have been emitted to the file. */ - @Advanced @Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false) public int MAX_RECORDS = -1; - int nRecords = 0; + long nRecords = 0L; /** * By default, records with multiple ALT alleles will comprise just one line of output; note that in general this can make your resulting file @@ -146,6 +147,15 @@ public class VariantsToTable extends RodWalker { @Argument(fullName="splitMultiAllelic", shortName="SMA", doc="If provided, we will split multi-allelic records into multiple lines of output", required=false) public boolean splitMultiAllelic = false; + /** + * By default, this tool emits one line per usable VCF record (or per allele if the -SMA flag is provided). Using the -moltenize flag + * will cause records to be split into multiple lines of output: one for each field provided with -F or one for each combination of sample + * and field provided with -GF. Note that the "Sample" column for -F fields will always be "site". + */ + @Advanced + @Argument(fullName="moltenize", shortName="moltenize", doc="If provided, we will produce molten output", required=false) + public boolean moltenizeOutput = false; + /** * By default, this tool throws a UserException when it encounters a field without a value in some record. This * is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHROM not being @@ -158,27 +168,29 @@ public class VariantsToTable extends RodWalker { public boolean ALLOW_MISSING_DATA = false; private final static String MISSING_DATA = "NA"; - private TreeSet samples = new TreeSet(); + private final List samples = new ArrayList(); public void initialize() { - String genotypeHeader = ""; - if (!genotypeFieldsToTake.isEmpty()) { + if ( !genotypeFieldsToTake.isEmpty() ) { Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), variants); TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); samples.addAll(vcfSamples); - - StringBuilder sb = new StringBuilder(); - sb.append("\t"); - for (final String sample : samples) { - for (final String gf : genotypeFieldsToTake) { - sb.append(sample+"."+gf+"\t"); - } - } - genotypeHeader = sb.toString(); + + // optimization: if there are no samples, we don't have to worry about any genotype fields + if ( samples.isEmpty() ) + genotypeFieldsToTake.clear(); } + // print out the header - out.println(Utils.join("\t", fieldsToTake) + genotypeHeader); + if ( moltenizeOutput ) { + out.println("RecordID\tSample\tVariable\tValue"); + } else { + final String baseHeader = Utils.join("\t", fieldsToTake); + final String genotypeHeader = createGenotypeHeader(genotypeFieldsToTake, samples); + final String separator = (!baseHeader.isEmpty() && !genotypeHeader.isEmpty()) ? "\t" : ""; + out.println(baseHeader + separator + genotypeHeader); + } } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -186,10 +198,14 @@ public class VariantsToTable extends RodWalker { return 0; for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) { + nRecords++; if ( showFiltered || vc.isNotFiltered() ) { - for ( final List record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples, - ALLOW_MISSING_DATA, splitMultiAllelic) ) - out.println(Utils.join("\t", record)); + for ( final List record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples, ALLOW_MISSING_DATA, splitMultiAllelic) ) { + if ( moltenizeOutput ) + emitMoltenizedOutput(record); + else + out.println(Utils.join("\t", record)); + } } } @@ -198,36 +214,68 @@ public class VariantsToTable extends RodWalker { @Override public boolean isDone() { - boolean done = MAX_RECORDS != -1 && nRecords >= MAX_RECORDS; - if ( done) logger.warn("isDone() will return true to leave after " + nRecords + " records"); - return done ; + return (MAX_RECORDS != -1 && nRecords >= MAX_RECORDS); } private static final boolean isWildCard(String s) { return s.endsWith("*"); } + private static String createGenotypeHeader(final List genotypeFieldsToTake, final List samples) { + boolean firstEntry = true; + + final StringBuilder sb = new StringBuilder(); + for ( final String sample : samples ) { + for ( final String gf : genotypeFieldsToTake ) { + if ( firstEntry ) + firstEntry = false; + else + sb.append("\t"); + sb.append(sample); + sb.append("."); + sb.append(gf); + } + } + return sb.toString(); + } + + private void emitMoltenizedOutput(final List record) { + int index = 0; + for ( final String field : fieldsToTake ) { + out.println(String.format("%d\tsite\t%s\t%s", nRecords, field, record.get(index++))); + } + for ( final String sample : samples ) { + for ( final String gf : genotypeFieldsToTake ) { + out.println(String.format("%d\t%s\t%s\t%s", nRecords, sample, gf, record.get(index++))); + } + } + } + /** * Utility function that returns the list of values for each field in fields from vc. * - * @param vc the VariantContext whose field values we can to capture - * @param fields a non-null list of fields to capture from VC - * @param genotypeFields a (possibly) null) list of fields to capture from each genotype - * @param samples set of samples in vc, can be null in case of sites-only file - * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA - * @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records + * @param vc the VariantContext whose field values we can to capture + * @param fields a non-null list of fields to capture from VC + * @param genotypeFields a (possibly null) list of fields to capture from each genotype + * @param samples list of samples in vc + * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA + * @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records * @return List of lists of field values */ - private static List> extractFields(VariantContext vc, List fields, List genotypeFields, - Set samples, boolean allowMissingData, boolean splitMultiAllelic) { + private static List> extractFields(final VariantContext vc, + final List fields, + final List genotypeFields, + final List samples, + final boolean allowMissingData, + final boolean splitMultiAllelic) { final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1; final List> records = new ArrayList>(numRecordsToProduce); int numFields = fields.size(); - final boolean addGenotypeFields = (genotypeFields != null && !genotypeFields.isEmpty() && samples != null && !samples.isEmpty()); - if (addGenotypeFields) - numFields += genotypeFields.size()*samples.size(); + final boolean addGenotypeFields = genotypeFields != null && !genotypeFields.isEmpty(); + if ( addGenotypeFields ) + numFields += genotypeFields.size() * samples.size(); for ( int i = 0; i < numRecordsToProduce; i++ ) records.add(new ArrayList(numFields)); @@ -263,16 +311,17 @@ public class VariantsToTable extends RodWalker { } } - if (addGenotypeFields) { - for (final String sample : samples) { - for (final String gf : genotypeFields) { - if (vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf)) - addFieldValue(vc.getGenotype(sample).getAttribute(gf),records); + if ( addGenotypeFields ) { + for ( final String sample : samples ) { + for ( final String gf : genotypeFields ) { + if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf) ) + addFieldValue(vc.getGenotype(sample).getAttribute(gf), records); else addFieldValue(MISSING_DATA, records); } } } + return records; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java index b32eb53a7..85773aa27 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -33,7 +33,7 @@ import java.util.*; public class VariantsToTableIntegrationTest extends WalkerTest { private String variantsToTableCmd(String moreArgs) { return "-R " + hg18Reference + - " --variant:vcf " + testDir + "/soap_gatk_annotated.vcf" + + " --variant:vcf " + testDir + "soap_gatk_annotated.vcf" + " -T VariantsToTable" + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F TRANSITION -F DP -F SB -F set -F RankSumP -F refseq.functionalClass*" + " -L chr1 -o %s" + moreArgs; @@ -41,7 +41,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest { private String variantsToTableMultiAllelicCmd(String moreArgs) { return "-R " + b37KGReference + - " --variant " + testDir + "/multiallelic.vcf" + + " --variant " + testDir + "multiallelic.vcf" + " -T VariantsToTable" + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC -F AC -F AF" + " -o %s" + moreArgs; @@ -51,7 +51,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest { public void testComplexVariantsToTable() { WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(" -AMD"), Arrays.asList("e8f771995127b727fb433da91dd4ee98")); - executeTest("testComplexVariantsToTable", spec).getFirst(); + executeTest("testComplexVariantsToTable", spec); } @Test(enabled = true) @@ -64,13 +64,68 @@ public class VariantsToTableIntegrationTest extends WalkerTest { public void testMultiAllelicOneRecord() { WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""), Arrays.asList("13dd36c08be6c800f23988e6000d963e")); - executeTest("testMultiAllelicOneRecord", spec).getFirst(); + executeTest("testMultiAllelicOneRecord", spec); } @Test(enabled = true) public void testMultiAllelicSplitRecords() { WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(" -SMA"), Arrays.asList("17a0fc80409d2fc00ad2bbb94b3a346b")); - executeTest("testMultiAllelicSplitRecords", spec).getFirst(); + executeTest("testMultiAllelicSplitRecords", spec); + } + + @Test(enabled = true) + public void testGenotypeFields() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " --variant " + testDir + "vcfexample2.vcf" + + " -T VariantsToTable" + + " -GF RD" + + " -o %s", + 1, + Arrays.asList("f80c4714d83226b6a6db8bf281b3bcba")); + executeTest("testGenotypeFields", spec); + } + + @Test(enabled = true) + public void testMoltenOutput() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " --variant " + testDir + "vcfexample2.vcf" + + " -T VariantsToTable" + + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER" + + " --moltenize" + + " -o %s", + 1, + Arrays.asList("30047a5e78a7f523bd2872ac8baccc0e")); + executeTest("testMoltenOutput", spec); + } + + @Test(enabled = true) + public void testMoltenOutputWithGenotypeFields() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " --variant " + testDir + "vcfexample2.vcf" + + " -T VariantsToTable" + + " -GF RD" + + " --moltenize" + + " -o %s", + 1, + Arrays.asList("132890fd33d16946e04b41cfd7453c0e")); + executeTest("testMoltenOutputWithGenotypeFields", spec); + } + + @Test(enabled = true) + public void testMoltenOutputWithMultipleAlleles() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " --variant " + testDir + "multiallelic.vcf" + + " -T VariantsToTable" + + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC -F AC -F AF" + + " --moltenize -SMA" + + " -o %s", + 1, + Arrays.asList("c131e2c3cfb673c456cb160bda476101")); + executeTest("testMoltenOutputWithMultipleAlleles", spec); } }