Refactored VariantsToTable so that 1) genotype-level fields can be specified (this code is now stabilized and supported) and 2) the --moltenize argument is supported to produce molten output of the data. Added tests that cover these capabilities.

This commit is contained in:
Eric Banks 2012-06-04 14:28:32 -04:00
parent f11e7ebc3a
commit 8405156ae1
2 changed files with 152 additions and 48 deletions

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.walkers.variantutils; package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
@ -111,13 +110,16 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
/** /**
* -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10). * -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10).
* Note that this tool does not support capturing any GENOTYPE field values. Note this argument * Note that to capture GENOTYPE (FORMAT) field values, see the GF argument. This argument accepts any number
* accepts any number of inputs. So -F CHROM -F POS is allowed. * of inputs. So -F CHROM -F POS is allowed.
*/ */
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true) @Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=false)
public List<String> fieldsToTake = new ArrayList<String>(); public List<String> fieldsToTake = new ArrayList<String>();
@Hidden /**
* -GF NAME can be any binding in the FORMAT field (e.g., GQ, PL).
* Note this argument accepts any number of inputs. So -GF GQ -GF PL is allowed.
*/
@Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false) @Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false)
public List<String> genotypeFieldsToTake = new ArrayList<String>(); public List<String> genotypeFieldsToTake = new ArrayList<String>();
@ -130,12 +132,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
public boolean showFiltered = false; public boolean showFiltered = false;
/** /**
* If provided, then this tool will exit with success after this number of records have been emitted to the file. * If provided, then this tool will exit with success after this number of VCF records have been emitted to the file.
*/ */
@Advanced
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false) @Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
public int MAX_RECORDS = -1; public int MAX_RECORDS = -1;
int nRecords = 0; long nRecords = 0L;
/** /**
* By default, records with multiple ALT alleles will comprise just one line of output; note that in general this can make your resulting file * By default, records with multiple ALT alleles will comprise just one line of output; note that in general this can make your resulting file
@ -146,6 +147,15 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
@Argument(fullName="splitMultiAllelic", shortName="SMA", doc="If provided, we will split multi-allelic records into multiple lines of output", required=false) @Argument(fullName="splitMultiAllelic", shortName="SMA", doc="If provided, we will split multi-allelic records into multiple lines of output", required=false)
public boolean splitMultiAllelic = false; public boolean splitMultiAllelic = false;
/**
* By default, this tool emits one line per usable VCF record (or per allele if the -SMA flag is provided). Using the -moltenize flag
* will cause records to be split into multiple lines of output: one for each field provided with -F or one for each combination of sample
* and field provided with -GF. Note that the "Sample" column for -F fields will always be "site".
*/
@Advanced
@Argument(fullName="moltenize", shortName="moltenize", doc="If provided, we will produce molten output", required=false)
public boolean moltenizeOutput = false;
/** /**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This * By default, this tool throws a UserException when it encounters a field without a value in some record. This
* is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHROM not being * is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHROM not being
@ -158,27 +168,29 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
public boolean ALLOW_MISSING_DATA = false; public boolean ALLOW_MISSING_DATA = false;
private final static String MISSING_DATA = "NA"; private final static String MISSING_DATA = "NA";
private TreeSet<String> samples = new TreeSet<String>(); private final List<String> samples = new ArrayList<String>();
public void initialize() { public void initialize() {
String genotypeHeader = ""; if ( !genotypeFieldsToTake.isEmpty() ) {
if (!genotypeFieldsToTake.isEmpty()) {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), variants); Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), variants);
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
samples.addAll(vcfSamples); samples.addAll(vcfSamples);
StringBuilder sb = new StringBuilder(); // optimization: if there are no samples, we don't have to worry about any genotype fields
sb.append("\t"); if ( samples.isEmpty() )
for (final String sample : samples) { genotypeFieldsToTake.clear();
for (final String gf : genotypeFieldsToTake) {
sb.append(sample+"."+gf+"\t");
}
}
genotypeHeader = sb.toString();
} }
// print out the header // print out the header
out.println(Utils.join("\t", fieldsToTake) + genotypeHeader); if ( moltenizeOutput ) {
out.println("RecordID\tSample\tVariable\tValue");
} else {
final String baseHeader = Utils.join("\t", fieldsToTake);
final String genotypeHeader = createGenotypeHeader(genotypeFieldsToTake, samples);
final String separator = (!baseHeader.isEmpty() && !genotypeHeader.isEmpty()) ? "\t" : "";
out.println(baseHeader + separator + genotypeHeader);
}
} }
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -186,10 +198,14 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return 0; return 0;
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) { for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
nRecords++;
if ( showFiltered || vc.isNotFiltered() ) { if ( showFiltered || vc.isNotFiltered() ) {
for ( final List<String> record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples, for ( final List<String> record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples, ALLOW_MISSING_DATA, splitMultiAllelic) ) {
ALLOW_MISSING_DATA, splitMultiAllelic) ) if ( moltenizeOutput )
out.println(Utils.join("\t", record)); emitMoltenizedOutput(record);
else
out.println(Utils.join("\t", record));
}
} }
} }
@ -198,36 +214,68 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
@Override @Override
public boolean isDone() { public boolean isDone() {
boolean done = MAX_RECORDS != -1 && nRecords >= MAX_RECORDS; return (MAX_RECORDS != -1 && nRecords >= MAX_RECORDS);
if ( done) logger.warn("isDone() will return true to leave after " + nRecords + " records");
return done ;
} }
private static final boolean isWildCard(String s) { private static final boolean isWildCard(String s) {
return s.endsWith("*"); return s.endsWith("*");
} }
/**
 * Builds the genotype portion of the table header.
 *
 * One column name of the form SAMPLE.FIELD is produced for every combination of sample and
 * genotype field, iterating samples in the outer loop and fields in the inner loop. Column
 * names are separated by single tabs, with no leading or trailing tab.
 *
 * @param genotypeFieldsToTake the genotype field names to create columns for
 * @param samples              the sample names to create columns for
 * @return the tab-separated column names, or the empty string if either list is empty
 */
private static String createGenotypeHeader(final List<String> genotypeFieldsToTake, final List<String> samples) {
    final StringBuilder header = new StringBuilder();
    for ( final String sampleName : samples ) {
        for ( final String fieldName : genotypeFieldsToTake ) {
            // every column name contains at least the "." separator, so a non-empty
            // buffer means an earlier column has already been written
            if ( header.length() > 0 ) {
                header.append('\t');
            }
            header.append(sampleName).append('.').append(fieldName);
        }
    }
    return header.toString();
}
/**
 * Writes a single extracted record to the output in molten form, i.e. one output line per value.
 *
 * The record is read positionally: the first fieldsToTake.size() entries are treated as the
 * site-level values (emitted with the literal sample name "site"), and the remaining entries are
 * consumed in sample-major order, one per genotype field of each sample. Every line starts with
 * the current value of nRecords as the record identifier.
 *
 * @param record the values of one record, site-level fields first, then genotype fields per sample
 */
private void emitMoltenizedOutput(final List<String> record) {
    final int numSiteFields = fieldsToTake.size();
    for ( int i = 0; i < numSiteFields; i++ ) {
        out.println(String.format("%d\tsite\t%s\t%s", nRecords, fieldsToTake.get(i), record.get(i)));
    }

    // genotype values start right after the site-level values
    int cursor = numSiteFields;
    for ( final String sampleName : samples ) {
        for ( final String genotypeField : genotypeFieldsToTake ) {
            final String value = record.get(cursor);
            cursor++;
            out.println(String.format("%d\t%s\t%s\t%s", nRecords, sampleName, genotypeField, value));
        }
    }
}
/** /**
* Utility function that returns the list of values for each field in fields from vc. * Utility function that returns the list of values for each field in fields from vc.
* *
* @param vc the VariantContext whose field values we want to capture * @param vc the VariantContext whose field values we want to capture
* @param fields a non-null list of fields to capture from VC * @param fields a non-null list of fields to capture from VC
* @param genotypeFields a (possibly) null) list of fields to capture from each genotype * @param genotypeFields a (possibly null) list of fields to capture from each genotype
* @param samples set of samples in vc, can be null in case of sites-only file * @param samples list of samples in vc
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA
* @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records * @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records
* @return List of lists of field values * @return List of lists of field values
*/ */
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, List<String> genotypeFields, private static List<List<String>> extractFields(final VariantContext vc,
Set<String> samples, boolean allowMissingData, boolean splitMultiAllelic) { final List<String> fields,
final List<String> genotypeFields,
final List<String> samples,
final boolean allowMissingData,
final boolean splitMultiAllelic) {
final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1; final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce); final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce);
int numFields = fields.size(); int numFields = fields.size();
final boolean addGenotypeFields = (genotypeFields != null && !genotypeFields.isEmpty() && samples != null && !samples.isEmpty()); final boolean addGenotypeFields = genotypeFields != null && !genotypeFields.isEmpty();
if (addGenotypeFields) if ( addGenotypeFields )
numFields += genotypeFields.size()*samples.size(); numFields += genotypeFields.size() * samples.size();
for ( int i = 0; i < numRecordsToProduce; i++ ) for ( int i = 0; i < numRecordsToProduce; i++ )
records.add(new ArrayList<String>(numFields)); records.add(new ArrayList<String>(numFields));
@ -263,16 +311,17 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
} }
} }
if (addGenotypeFields) { if ( addGenotypeFields ) {
for (final String sample : samples) { for ( final String sample : samples ) {
for (final String gf : genotypeFields) { for ( final String gf : genotypeFields ) {
if (vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf)) if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf) )
addFieldValue(vc.getGenotype(sample).getAttribute(gf),records); addFieldValue(vc.getGenotype(sample).getAttribute(gf), records);
else else
addFieldValue(MISSING_DATA, records); addFieldValue(MISSING_DATA, records);
} }
} }
} }
return records; return records;
} }

View File

@ -33,7 +33,7 @@ import java.util.*;
public class VariantsToTableIntegrationTest extends WalkerTest { public class VariantsToTableIntegrationTest extends WalkerTest {
private String variantsToTableCmd(String moreArgs) { private String variantsToTableCmd(String moreArgs) {
return "-R " + hg18Reference + return "-R " + hg18Reference +
" --variant:vcf " + testDir + "/soap_gatk_annotated.vcf" + " --variant:vcf " + testDir + "soap_gatk_annotated.vcf" +
" -T VariantsToTable" + " -T VariantsToTable" +
" -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F TRANSITION -F DP -F SB -F set -F RankSumP -F refseq.functionalClass*" + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F TRANSITION -F DP -F SB -F set -F RankSumP -F refseq.functionalClass*" +
" -L chr1 -o %s" + moreArgs; " -L chr1 -o %s" + moreArgs;
@ -41,7 +41,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
private String variantsToTableMultiAllelicCmd(String moreArgs) { private String variantsToTableMultiAllelicCmd(String moreArgs) {
return "-R " + b37KGReference + return "-R " + b37KGReference +
" --variant " + testDir + "/multiallelic.vcf" + " --variant " + testDir + "multiallelic.vcf" +
" -T VariantsToTable" + " -T VariantsToTable" +
" -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC -F AC -F AF" + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC -F AC -F AF" +
" -o %s" + moreArgs; " -o %s" + moreArgs;
@ -51,7 +51,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
public void testComplexVariantsToTable() { public void testComplexVariantsToTable() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(" -AMD"), WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(" -AMD"),
Arrays.asList("e8f771995127b727fb433da91dd4ee98")); Arrays.asList("e8f771995127b727fb433da91dd4ee98"));
executeTest("testComplexVariantsToTable", spec).getFirst(); executeTest("testComplexVariantsToTable", spec);
} }
@Test(enabled = true) @Test(enabled = true)
@ -64,13 +64,68 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
public void testMultiAllelicOneRecord() { public void testMultiAllelicOneRecord() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""), WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""),
Arrays.asList("13dd36c08be6c800f23988e6000d963e")); Arrays.asList("13dd36c08be6c800f23988e6000d963e"));
executeTest("testMultiAllelicOneRecord", spec).getFirst(); executeTest("testMultiAllelicOneRecord", spec);
} }
@Test(enabled = true) @Test(enabled = true)
public void testMultiAllelicSplitRecords() { public void testMultiAllelicSplitRecords() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(" -SMA"), WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(" -SMA"),
Arrays.asList("17a0fc80409d2fc00ad2bbb94b3a346b")); Arrays.asList("17a0fc80409d2fc00ad2bbb94b3a346b"));
executeTest("testMultiAllelicSplitRecords", spec).getFirst(); executeTest("testMultiAllelicSplitRecords", spec);
}
/**
 * Runs VariantsToTable on vcfexample2.vcf requesting only the RD genotype field (no -F fields).
 */
@Test(enabled = true)
public void testGenotypeFields() {
    final String command = "-R " + b36KGReference
            + " --variant " + testDir + "vcfexample2.vcf"
            + " -T VariantsToTable"
            + " -GF RD"
            + " -o %s";
    // NOTE(review): the 1 looks like the number of output files and the hex string like the
    // expected MD5 of the output -- confirm against WalkerTestSpec
    final WalkerTest.WalkerTestSpec spec =
            new WalkerTest.WalkerTestSpec(command, 1, Arrays.asList("f80c4714d83226b6a6db8bf281b3bcba"));
    executeTest("testGenotypeFields", spec);
}
/**
 * Runs VariantsToTable on vcfexample2.vcf with site-level -F fields and the --moltenize flag.
 */
@Test(enabled = true)
public void testMoltenOutput() {
    final String command = "-R " + b36KGReference
            + " --variant " + testDir + "vcfexample2.vcf"
            + " -T VariantsToTable"
            + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER"
            + " --moltenize"
            + " -o %s";
    // NOTE(review): the 1 looks like the number of output files and the hex string like the
    // expected MD5 of the output -- confirm against WalkerTestSpec
    final WalkerTest.WalkerTestSpec spec =
            new WalkerTest.WalkerTestSpec(command, 1, Arrays.asList("30047a5e78a7f523bd2872ac8baccc0e"));
    executeTest("testMoltenOutput", spec);
}
/**
 * Runs VariantsToTable on vcfexample2.vcf with the RD genotype field and the --moltenize flag.
 */
@Test(enabled = true)
public void testMoltenOutputWithGenotypeFields() {
    final String command = "-R " + b36KGReference
            + " --variant " + testDir + "vcfexample2.vcf"
            + " -T VariantsToTable"
            + " -GF RD"
            + " --moltenize"
            + " -o %s";
    // NOTE(review): the 1 looks like the number of output files and the hex string like the
    // expected MD5 of the output -- confirm against WalkerTestSpec
    final WalkerTest.WalkerTestSpec spec =
            new WalkerTest.WalkerTestSpec(command, 1, Arrays.asList("132890fd33d16946e04b41cfd7453c0e"));
    executeTest("testMoltenOutputWithGenotypeFields", spec);
}
@Test(enabled = true)
public void testMoltenOutputWithMultipleAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" --variant " + testDir + "multiallelic.vcf" +
" -T VariantsToTable" +
" -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC -F AC -F AF" +
" --moltenize -SMA" +
" -o %s",
1,
Arrays.asList("c131e2c3cfb673c456cb160bda476101"));
executeTest("testMoltenOutputWithMultipleAlleles", spec);
} }
} }