Cleaning up VariantsToTable: added docs for supported fields; removed one-off hidden arguments for multi-allelics; default behavior is now to include multi-allelics in one record; added option to split multi-allelics into separate records.
This commit is contained in:
parent
c8c06c7753
commit
14981bed10
|
|
@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
|
@ -49,7 +48,13 @@ import java.util.*;
|
|||
* fields to print with the -F NAME, each of which appears as a single column in
|
||||
* the output file, with a header named NAME, and the value of this field in the VCF
|
||||
* one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding
|
||||
* in the INFO field (AC=10). Note that this tool does not support capturing any
|
||||
* in the INFO field (AC=10). In addition, there are specially supported values like
|
||||
* EVENTLENGTH (length of the event), TRANSITION (for SNPs), HET (count of het genotypes),
|
||||
* HOM-REF (count of homozygous reference genotypes), HOM-VAR (count of homozygous variant
|
||||
* genotypes), NO-CALL (count of no-call genotypes), TYPE (the type of event), VAR (count of
|
||||
* non-reference genotypes), NSAMPLES (number of samples), NCALLED (number of called samples),
|
||||
* GQ (from the genotype field; works only for a file with a single sample), and MULTI-ALLELIC
|
||||
* (is the record from a multi-allelic site). Note that this tool does not support capturing any
|
||||
* GENOTYPE field values. If a VCF record is missing a value, then the tool by
|
||||
* default throws an error, but the special value NA can be emitted instead with
|
||||
* appropriate tool arguments.
|
||||
|
|
@ -121,18 +126,13 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
int nRecords = 0;
|
||||
|
||||
/**
|
||||
* By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then
|
||||
* VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this
|
||||
* can make your resulting file unreadable and malformated according to tools like R, as the representation of
|
||||
* multi-allelic INFO field values can be lists of values.
|
||||
* By default, records with multiple ALT alleles will comprise just one line of output; note that in general this can make your resulting file
|
||||
* unreadable/malformed for certain tools like R, as the representation of multi-allelic INFO field values are often comma-separated lists
|
||||
* of values. Using the flag will cause multi-allelic records to be split into multiple lines of output (one for each allele in the ALT field);
|
||||
* INFO field values that are not lists are copied for each of the output records while only the appropriate entry is used for lists.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
|
||||
public boolean keepMultiAllelic = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false)
|
||||
public boolean logACSum = false;
|
||||
@Argument(fullName="splitMultiAllelic", shortName="SMA", doc="If provided, we will split multi-allelic records into multiple lines of output", required=false)
|
||||
public boolean splitMultiAllelic = false;
|
||||
|
||||
/**
|
||||
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
|
||||
|
|
@ -144,6 +144,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
@Advanced
|
||||
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
|
||||
public boolean ALLOW_MISSING_DATA = false;
|
||||
private final static String MISSING_DATA = "NA";
|
||||
|
||||
public void initialize() {
|
||||
// print out the header
|
||||
|
|
@ -155,9 +156,9 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
return 0;
|
||||
|
||||
for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
|
||||
if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
|
||||
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum);
|
||||
out.println(Utils.join("\t", vals));
|
||||
if ( showFiltered || vc.isNotFiltered() ) {
|
||||
for ( final List<String> record : extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, splitMultiAllelic) )
|
||||
out.println(Utils.join("\t", record));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -180,22 +181,23 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
*
|
||||
* @param vc the VariantContext whose field values we can to capture
|
||||
* @param fields a non-null list of fields to capture from VC
|
||||
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
|
||||
* provides a value of NA
|
||||
* @param kma if true, multiallelic variants are to be kept
|
||||
* @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count.
|
||||
* @return
|
||||
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise provides a value of NA
|
||||
* @param splitMultiAllelic if true, multiallelic variants are to be split into multiple records
|
||||
* @return List of lists of field values
|
||||
*/
|
||||
private static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean kma, boolean logsum) {
|
||||
List<String> vals = new ArrayList<String>();
|
||||
private static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean splitMultiAllelic) {
|
||||
|
||||
final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
|
||||
final List<List<String>> records = new ArrayList<List<String>>(numRecordsToProduce);
|
||||
for ( int i = 0; i < numRecordsToProduce; i++ )
|
||||
records.add(new ArrayList<String>(fields.size()));
|
||||
|
||||
for ( String field : fields ) {
|
||||
String val = "NA";
|
||||
|
||||
if ( getters.containsKey(field) ) {
|
||||
val = getters.get(field).get(vc);
|
||||
addFieldValue(getters.get(field).get(vc), records);
|
||||
} else if ( vc.hasAttribute(field) ) {
|
||||
val = vc.getAttributeAsString(field, null);
|
||||
addFieldValue(vc.getAttribute(field, null), records);
|
||||
} else if ( isWildCard(field) ) {
|
||||
Set<String> wildVals = new HashSet<String>();
|
||||
for ( Map.Entry<String,Object> elt : vc.getAttributes().entrySet()) {
|
||||
|
|
@ -204,51 +206,47 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
String val = MISSING_DATA;
|
||||
if ( wildVals.size() > 0 ) {
|
||||
List<String> toVal = new ArrayList<String>(wildVals);
|
||||
Collections.sort(toVal);
|
||||
val = Utils.join(",", toVal);
|
||||
}
|
||||
|
||||
addFieldValue(val, records);
|
||||
} else if ( ! allowMissingData ) {
|
||||
throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc));
|
||||
} else {
|
||||
addFieldValue(MISSING_DATA, records);
|
||||
}
|
||||
|
||||
if (field.equals("AF") || field.equals("AC")) {
|
||||
String afo = val;
|
||||
|
||||
double af=0;
|
||||
if (afo.contains(",")) {
|
||||
String[] afs = afo.split(",");
|
||||
afs[0] = afs[0].substring(1,afs[0].length());
|
||||
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
||||
|
||||
double[] afd = new double[afs.length];
|
||||
|
||||
for (int k=0; k < afd.length; k++)
|
||||
afd[k] = Double.valueOf(afs[k]);
|
||||
|
||||
if (kma && logsum)
|
||||
af = MathUtils.sum(afd);
|
||||
else
|
||||
af = MathUtils.arrayMax(afd);
|
||||
//af = Double.valueOf(afs[0]);
|
||||
|
||||
}
|
||||
else
|
||||
if (!afo.equals("NA"))
|
||||
af = Double.valueOf(afo);
|
||||
|
||||
val = Double.toString(af);
|
||||
|
||||
}
|
||||
vals.add(val);
|
||||
}
|
||||
|
||||
return vals;
|
||||
return records;
|
||||
}
|
||||
|
||||
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||
return extractFields(vc, fields, allowMissingData, false, false);
|
||||
private static void addFieldValue(Object val, List<List<String>> result) {
|
||||
final int numResultRecords = result.size();
|
||||
|
||||
// if we're trying to create a single output record, add it
|
||||
if ( numResultRecords == 1 ) {
|
||||
result.get(0).add(val.toString());
|
||||
}
|
||||
// if this field is a list of the proper size, add the appropriate entry to each record
|
||||
else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) {
|
||||
final List list = (List)val;
|
||||
for ( int i = 0; i < numResultRecords; i++ )
|
||||
result.get(i).add(list.get(i).toString());
|
||||
}
|
||||
// otherwise, add the original value to all of the records
|
||||
else {
|
||||
final String valStr = val.toString();
|
||||
for ( List<String> record : result )
|
||||
record.add(valStr);
|
||||
}
|
||||
}
|
||||
|
||||
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||
return extractFields(vc, fields, allowMissingData, false);
|
||||
}
|
||||
//
|
||||
// default reduce -- doesn't do anything at all
|
||||
|
|
@ -321,6 +319,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
|
||||
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
|
||||
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
|
||||
getters.put("MULTI-ALLELIC", new Getter() { public String get(VariantContext vc) { return Boolean.toString(vc.getAlternateAlleles().size() > 1); } });
|
||||
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
|
||||
return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError());
|
||||
|
|
|
|||
|
|
@ -27,10 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
|
|||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.DataProvider;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
|
||||
public class VariantsToTableIntegrationTest extends WalkerTest {
|
||||
private String variantsToTableCmd(String moreArgs) {
|
||||
|
|
@ -38,7 +36,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
|
|||
" --variant:vcf " + validationDataLocation + "/soap_gatk_annotated.vcf" +
|
||||
" -T VariantsToTable" +
|
||||
" -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F TRANSITION -F DP -F SB -F set -F RankSumP -F refseq.functionalClass*" +
|
||||
" -L chr1 -KMA -o %s" + moreArgs;
|
||||
" -L chr1 -o %s" + moreArgs;
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
|
|
|
|||
Loading…
Reference in New Issue