diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index e43d54e14..9f4718ef2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -49,7 +48,13 @@ import java.util.*; * fields to print with the -F NAME, each of which appears as a single column in * the output file, with a header named NAME, and the value of this field in the VCF * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding - * in the INFO field (AC=10). Note that this tool does not support capturing any + * in the INFO field (AC=10). In addition, there are specially supported values like + * EVENTLENGTH (length of the event), TRANSITION (for SNPs), HET (count of het genotypes), + * HOM-REF (count of homozygous reference genotypes), HOM-VAR (count of homozygous variant + * genotypes), NO-CALL (count of no-call genotypes), TYPE (the type of event), VAR (count of + * non-reference genotypes), NSAMPLES (number of samples), NCALLED (number of called samples), + * GQ (from the genotype field; works only for a file with a single sample), and MULTI-ALLELIC + * (is the record from a multi-allelic site). Note that this tool does not support capturing any * GENOTYPE field values. If a VCF record is missing a value, then the tool by * default throws an error, but the special value NA can be emitted instead with * appropriate tool arguments. @@ -121,18 +126,13 @@ public class VariantsToTable extends RodWalker { int nRecords = 0; /** - * By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then - * VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this - * can make your resulting file unreadable and malformated according to tools like R, as the representation of - * multi-allelic INFO field values can be lists of values. + * By default, records with multiple ALT alleles will comprise just one line of output; note that in general this can make your resulting file + * unreadable/malformed for certain tools like R, as the representation of multi-allelic INFO field values are often comma-separated lists + * of values. Using the flag will cause multi-allelic records to be split into multiple lines of output (one for each allele in the ALT field); + * INFO field values that are not lists are copied for each of the output records while only the appropriate entry is used for lists. */ - @Advanced - @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) - public boolean keepMultiAllelic = false; - - @Hidden - @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false) - public boolean logACSum = false; + @Argument(fullName="splitMultiAllelic", shortName="SMA", doc="If provided, we will split multi-allelic records into multiple lines of output", required=false) + public boolean splitMultiAllelic = false; /** * By default, this tool throws a UserException when it encounters a field without a value in some record. This @@ -144,6 +144,7 @@ public class VariantsToTable extends RodWalker { @Advanced @Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false) public boolean ALLOW_MISSING_DATA = false; + private final static String MISSING_DATA = "NA"; public void initialize() { // print out the header @@ -155,9 +156,9 @@ public class VariantsToTable extends RodWalker { return 0; for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { - if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) { - List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum); - out.println(Utils.join("\t", vals)); + if ( showFiltered || vc.isNotFiltered() ) { + for ( final List record : extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, splitMultiAllelic) ) + out.println(Utils.join("\t", record)); } } @@ -180,22 +181,23 @@ public class VariantsToTable extends RodWalker { * * @param vc the VariantContext whose field values we can to capture * @param fields a non-null list of fields to capture from VC - * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise - * provides a value of NA - * @param kma if true, multiallelic variants are to be kept - * @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count. - * @return + * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA + * @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records + * @return List of lists of field values */ - private static List extractFields(VariantContext vc, List fields, boolean allowMissingData, boolean kma, boolean logsum) { - List vals = new ArrayList(); + private static List> extractFields(VariantContext vc, List fields, boolean allowMissingData, boolean splitMultiAllelic) { + + final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1; + final List> records = new ArrayList>(numRecordsToProduce); + for ( int i = 0; i < numRecordsToProduce; i++ ) + records.add(new ArrayList(fields.size())); for ( String field : fields ) { - String val = "NA"; if ( getters.containsKey(field) ) { - val = getters.get(field).get(vc); + addFieldValue(getters.get(field).get(vc), records); } else if ( vc.hasAttribute(field) ) { - val = vc.getAttributeAsString(field, null); + addFieldValue(vc.getAttribute(field, null), records); } else if ( isWildCard(field) ) { Set wildVals = new HashSet(); for ( Map.Entry elt : vc.getAttributes().entrySet()) { @@ -204,51 +206,47 @@ public class VariantsToTable extends RodWalker { } } + String val = MISSING_DATA; if ( wildVals.size() > 0 ) { List toVal = new ArrayList(wildVals); Collections.sort(toVal); val = Utils.join(",", toVal); } + + addFieldValue(val, records); } else if ( ! allowMissingData ) { throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc)); + } else { + addFieldValue(MISSING_DATA, records); } - - if (field.equals("AF") || field.equals("AC")) { - String afo = val; - - double af=0; - if (afo.contains(",")) { - String[] afs = afo.split(","); - afs[0] = afs[0].substring(1,afs[0].length()); - afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); - - double[] afd = new double[afs.length]; - - for (int k=0; k < afd.length; k++) - afd[k] = Double.valueOf(afs[k]); - - if (kma && logsum) - af = MathUtils.sum(afd); - else - af = MathUtils.arrayMax(afd); - //af = Double.valueOf(afs[0]); - - } - else - if (!afo.equals("NA")) - af = Double.valueOf(afo); - - val = Double.toString(af); - - } - vals.add(val); } - return vals; + return records; } - public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { - return extractFields(vc, fields, allowMissingData, false, false); + private static void addFieldValue(Object val, List> result) { + final int numResultRecords = result.size(); + + // if we're trying to create a single output record, add it + if ( numResultRecords == 1 ) { + result.get(0).add(val.toString()); + } + // if this field is a list of the proper size, add the appropriate entry to each record + else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) { + final List list = (List)val; + for ( int i = 0; i < numResultRecords; i++ ) + result.get(i).add(list.get(i).toString()); + } + // otherwise, add the original value to all of the records + else { + final String valStr = val.toString(); + for ( List record : result ) + record.add(valStr); + } + } + + public static List> extractFields(VariantContext vc, List fields, boolean allowMissingData) { + return extractFields(vc, fields, allowMissingData, false); } // // default reduce -- doesn't do anything at all @@ -321,6 +319,7 @@ public class VariantsToTable extends RodWalker { getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); + getters.put("MULTI-ALLELIC", new Getter() { public String get(VariantContext vc) { return Boolean.toString(vc.getAlternateAlleles().size() > 1); } }); getters.put("GQ", new Getter() { public String get(VariantContext vc) { if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java index 19021c1c2..0ab593e7a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -27,10 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; import java.util.*; -import java.io.File; public class VariantsToTableIntegrationTest extends WalkerTest { private String variantsToTableCmd(String moreArgs) { @@ -38,7 +36,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest { " --variant:vcf " + validationDataLocation + "/soap_gatk_annotated.vcf" + " -T VariantsToTable" + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F TRANSITION -F DP -F SB -F set -F RankSumP -F refseq.functionalClass*" + - " -L chr1 -KMA -o %s" + moreArgs; + " -L chr1 -o %s" + moreArgs; } @Test(enabled = true)