Refactored VariantsToTable so that 1) genotype-level fields can be specified (stabilized and supported code) and 2) the --moltenize argument could be supported to produce molten output of the data. Added tests that cover these capabilities.

This commit is contained in:
Eric Banks 2012-06-04 14:28:32 -04:00
parent f11e7ebc3a
commit 8405156ae1
2 changed files with 152 additions and 48 deletions

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
@ -111,13 +110,16 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
/**
* -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10).
* Note that this tool does not support capturing any GENOTYPE field values. Note this argument
* accepts any number of inputs. So -F CHROM -F POS is allowed.
* Note that to capture GENOTYPE (FORMAT) field values, see the GF argument. This argument accepts any number
* of inputs. So -F CHROM -F POS is allowed.
*/
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=false)
public List<String> fieldsToTake = new ArrayList<String>();
@Hidden
/**
* -GF NAME can be any binding in the FORMAT field (e.g., GQ, PL).
* Note this argument accepts any number of inputs. So -F GQ -F PL is allowed.
*/
@Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false)
public List<String> genotypeFieldsToTake = new ArrayList<String>();
@ -130,12 +132,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
public boolean showFiltered = false;
/**
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
* If provided, then this tool will exit with success after this number of VCF records have been emitted to the file.
*/
@Advanced
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
public int MAX_RECORDS = -1;
int nRecords = 0;
long nRecords = 0L;
/**
* By default, records with multiple ALT alleles will comprise just one line of output; note that in general this can make your resulting file
@ -146,6 +147,15 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
@Argument(fullName="splitMultiAllelic", shortName="SMA", doc="If provided, we will split multi-allelic records into multiple lines of output", required=false)
public boolean splitMultiAllelic = false;
/**
* By default, this tool emits one line per usable VCF record (or per allele if the -SMA flag is provided). Using the -moltenize flag
* will cause records to be split into multiple lines of output: one for each field provided with -F or one for each combination of sample
* and field provided with -GF. Note that the "Sample" column for -F fields will always be "site".
*/
@Advanced
@Argument(fullName="moltenize", shortName="moltenize", doc="If provided, we will produce molten output", required=false)
public boolean moltenizeOutput = false;
/**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
* is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHROM not being
@ -158,27 +168,29 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
public boolean ALLOW_MISSING_DATA = false;
private final static String MISSING_DATA = "NA";
private TreeSet<String> samples = new TreeSet<String>();
private final List<String> samples = new ArrayList<String>();
public void initialize() {
String genotypeHeader = "";
if (!genotypeFieldsToTake.isEmpty()) {
if ( !genotypeFieldsToTake.isEmpty() ) {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), variants);
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
samples.addAll(vcfSamples);
StringBuilder sb = new StringBuilder();
sb.append("\t");
for (final String sample : samples) {
for (final String gf : genotypeFieldsToTake) {
sb.append(sample+"."+gf+"\t");
}
}
genotypeHeader = sb.toString();
// optimization: if there are no samples, we don't have to worry about any genotype fields
if ( samples.isEmpty() )
genotypeFieldsToTake.clear();
}
// print out the header
out.println(Utils.join("\t", fieldsToTake) + genotypeHeader);
if ( moltenizeOutput ) {
out.println("RecordID\tSample\tVariable\tValue");
} else {
final String baseHeader = Utils.join("\t", fieldsToTake);
final String genotypeHeader = createGenotypeHeader(genotypeFieldsToTake, samples);
final String separator = (!baseHeader.isEmpty() && !genotypeHeader.isEmpty()) ? "\t" : "";
out.println(baseHeader + separator + genotypeHeader);
}
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -186,10 +198,14 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return 0;
for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
nRecords++;
if ( showFiltered || vc.isNotFiltered() ) {
for ( final List<String> record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples,
ALLOW_MISSING_DATA, splitMultiAllelic) )
out.println(Utils.join("\t", record));
for ( final List<String> record : extractFields(vc, fieldsToTake, genotypeFieldsToTake, samples, ALLOW_MISSING_DATA, splitMultiAllelic) ) {
if ( moltenizeOutput )
emitMoltenizedOutput(record);
else
out.println(Utils.join("\t", record));
}
}
}
@ -198,36 +214,68 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
@Override
public boolean isDone() {
boolean done = MAX_RECORDS != -1 && nRecords >= MAX_RECORDS;
if ( done) logger.warn("isDone() will return true to leave after " + nRecords + " records");
return done ;
return (MAX_RECORDS != -1 && nRecords >= MAX_RECORDS);
}
private static final boolean isWildCard(String s) {
return s.endsWith("*");
}
private static String createGenotypeHeader(final List<String> genotypeFieldsToTake, final List<String> samples) {
boolean firstEntry = true;
final StringBuilder sb = new StringBuilder();
for ( final String sample : samples ) {
for ( final String gf : genotypeFieldsToTake ) {
if ( firstEntry )
firstEntry = false;
else
sb.append("\t");
sb.append(sample);
sb.append(".");
sb.append(gf);
}
}
return sb.toString();
}
private void emitMoltenizedOutput(final List<String> record) {
int index = 0;
for ( final String field : fieldsToTake ) {
out.println(String.format("%d\tsite\t%s\t%s", nRecords, field, record.get(index++)));
}
for ( final String sample : samples ) {
for ( final String gf : genotypeFieldsToTake ) {
out.println(String.format("%d\t%s\t%s\t%s", nRecords, sample, gf, record.get(index++)));
}
}
}
/**
* Utility function that returns the list of values for each field in fields from vc.
*
* @param vc the VariantContext whose field values we can to capture
* @param fields a non-null list of fields to capture from VC
* @param genotypeFields a (possibly) null) list of fields to capture from each genotype
* @param samples set of samples in vc, can be null in case of sites-only file
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA
* @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records
* @param vc the VariantContext whose field values we can to capture
* @param fields a non-null list of fields to capture from VC
* @param genotypeFields a (possibly null) list of fields to capture from each genotype
* @param samples list of samples in vc
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA
* @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records
* @return List of lists of field values
*/
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, List<String> genotypeFields,
Set<String> samples, boolean allowMissingData, boolean splitMultiAllelic) {
private static List<List<String>> extractFields(final VariantContext vc,
final List<String> fields,
final List<String> genotypeFields,
final List<String> samples,
final boolean allowMissingData,
final boolean splitMultiAllelic) {
final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce);
int numFields = fields.size();
final boolean addGenotypeFields = (genotypeFields != null && !genotypeFields.isEmpty() && samples != null && !samples.isEmpty());
if (addGenotypeFields)
numFields += genotypeFields.size()*samples.size();
final boolean addGenotypeFields = genotypeFields != null && !genotypeFields.isEmpty();
if ( addGenotypeFields )
numFields += genotypeFields.size() * samples.size();
for ( int i = 0; i < numRecordsToProduce; i++ )
records.add(new ArrayList<String>(numFields));
@ -263,16 +311,17 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
}
if (addGenotypeFields) {
for (final String sample : samples) {
for (final String gf : genotypeFields) {
if (vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf))
addFieldValue(vc.getGenotype(sample).getAttribute(gf),records);
if ( addGenotypeFields ) {
for ( final String sample : samples ) {
for ( final String gf : genotypeFields ) {
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf) )
addFieldValue(vc.getGenotype(sample).getAttribute(gf), records);
else
addFieldValue(MISSING_DATA, records);
}
}
}
return records;
}

View File

@ -33,7 +33,7 @@ import java.util.*;
public class VariantsToTableIntegrationTest extends WalkerTest {
private String variantsToTableCmd(String moreArgs) {
return "-R " + hg18Reference +
" --variant:vcf " + testDir + "/soap_gatk_annotated.vcf" +
" --variant:vcf " + testDir + "soap_gatk_annotated.vcf" +
" -T VariantsToTable" +
" -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F TRANSITION -F DP -F SB -F set -F RankSumP -F refseq.functionalClass*" +
" -L chr1 -o %s" + moreArgs;
@ -41,7 +41,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
private String variantsToTableMultiAllelicCmd(String moreArgs) {
return "-R " + b37KGReference +
" --variant " + testDir + "/multiallelic.vcf" +
" --variant " + testDir + "multiallelic.vcf" +
" -T VariantsToTable" +
" -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC -F AC -F AF" +
" -o %s" + moreArgs;
@ -51,7 +51,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
public void testComplexVariantsToTable() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(" -AMD"),
Arrays.asList("e8f771995127b727fb433da91dd4ee98"));
executeTest("testComplexVariantsToTable", spec).getFirst();
executeTest("testComplexVariantsToTable", spec);
}
@Test(enabled = true)
@ -64,13 +64,68 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
public void testMultiAllelicOneRecord() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""),
Arrays.asList("13dd36c08be6c800f23988e6000d963e"));
executeTest("testMultiAllelicOneRecord", spec).getFirst();
executeTest("testMultiAllelicOneRecord", spec);
}
@Test(enabled = true)
public void testMultiAllelicSplitRecords() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(" -SMA"),
Arrays.asList("17a0fc80409d2fc00ad2bbb94b3a346b"));
executeTest("testMultiAllelicSplitRecords", spec).getFirst();
executeTest("testMultiAllelicSplitRecords", spec);
}
@Test(enabled = true)
public void testGenotypeFields() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --variant " + testDir + "vcfexample2.vcf" +
" -T VariantsToTable" +
" -GF RD" +
" -o %s",
1,
Arrays.asList("f80c4714d83226b6a6db8bf281b3bcba"));
executeTest("testGenotypeFields", spec);
}
@Test(enabled = true)
public void testMoltenOutput() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --variant " + testDir + "vcfexample2.vcf" +
" -T VariantsToTable" +
" -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER" +
" --moltenize" +
" -o %s",
1,
Arrays.asList("30047a5e78a7f523bd2872ac8baccc0e"));
executeTest("testMoltenOutput", spec);
}
@Test(enabled = true)
public void testMoltenOutputWithGenotypeFields() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --variant " + testDir + "vcfexample2.vcf" +
" -T VariantsToTable" +
" -GF RD" +
" --moltenize" +
" -o %s",
1,
Arrays.asList("132890fd33d16946e04b41cfd7453c0e"));
executeTest("testMoltenOutputWithGenotypeFields", spec);
}
@Test(enabled = true)
public void testMoltenOutputWithMultipleAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" --variant " + testDir + "multiallelic.vcf" +
" -T VariantsToTable" +
" -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC -F AC -F AF" +
" --moltenize -SMA" +
" -o %s",
1,
Arrays.asList("c131e2c3cfb673c456cb160bda476101"));
executeTest("testMoltenOutputWithMultipleAlleles", spec);
}
}