a) Renamed/expanded SelectVariants arguments that choose particular kinds of variants and particular allelic types, now instead of -Indels or -SNPs we can specify for example -selectType [MIXED|INDEL|SNP|MNP|SYMBOLIC]. To select biallelic, multiallelic variants, use -restrictAllelesTo [BIALLELIC|MULTIALLELIC]. Corresponding gatkdocs changes.

b) More useful AC,AF logging in VariantsToTable with multiallelic sites: instead of logging comma-separated values, log max value by default. Hidden, experimental argument -logACSum to log sum of ACs instead. This is due to extreme slowness of R in parsing strings to tokens and computing max/sum itself (~100x slower than gatk).
c) Added integrationtest for new SelectVariants commands
This commit is contained in:
Guillermo del Angel 2011-08-24 12:25:50 -04:00
parent cd12f7f286
commit e618cb1e79
3 changed files with 112 additions and 27 deletions

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.apache.poi.hpsf.Variant;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.utils.MathUtils;
@ -165,6 +166,23 @@ import java.util.*;
* -o output.vcf \
* -fraction 0.5
*
* Select only indels from a VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -selectType INDEL
*
* Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -selectType SNP -selectType MNP \
* -restrictAllelesTo MULTIALLELIC
*
* </pre>
*
*/
@ -223,11 +241,17 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
private boolean EXCLUDE_FILTERED = false;
@Argument(fullName="excludeBiallelic", shortName="xl_biallelic", doc="Select only multi-allelic variants, excluding any biallelic one.", required=false)
private boolean EXCLUDE_BIALLELIC = false;
@Argument(fullName="selectMultiallelic", shortName="xl_multiallelic", doc="Select only biallelic variants, excluding any multi-allelic one.", required=false)
private boolean EXCLUDE_MULTIALLELIC = false;
/**
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
* For example, a multiallelic record such as:
* 1 100 . A AAA,AAAAA
* will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as:
* 1 100 . A T
* will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC
*/
@Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false)
private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@ -272,11 +296,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
private double fractionRandom = 0;
@Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false)
private boolean SELECT_SNPS = false;
/**
* This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
* When specified one or more times, a particular type of variant is selected.
*
*/
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();
@Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false)
private boolean SELECT_INDELS = false;
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@ -296,6 +323,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
}
public enum NumberAlleleRestriction {
ALL,
BIALLELIC,
MULTIALLELIC
}
private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
private ArrayList<String> selectNames = new ArrayList<String>();
private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
@ -360,6 +394,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
if (TYPES_TO_INCLUDE.isEmpty()) {
for (VariantContext.Type t : VariantContext.Type.values())
selectedTypes.add(t);
}
else {
for (VariantContext.Type t : TYPES_TO_INCLUDE)
selectedTypes.add(t);
}
// Initialize VCF header
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
@ -499,18 +547,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
if (!isConcordant(vc, compVCs))
return 0;
}
if (EXCLUDE_BIALLELIC && vc.isBiallelic())
return 0;
if (EXCLUDE_MULTIALLELIC && !vc.isBiallelic())
return 0;
// TODO - add ability to also select MNPs
// TODO - move variant selection arguments to the engine so other walkers can also do this
if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed()))
if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
continue;
if (SELECT_SNPS && !vc.isSNP())
if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
continue;
if (!selectedTypes.contains(vc.getType()))
continue;
VariantContext sub = subsetRecord(vc, samples);

View File

@ -124,8 +124,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* multi-allelic INFO field values can be lists of values.
*/
@Advanced
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
@Hidden
@Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false)
public boolean logACSum = false;
/**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
@ -150,7 +154,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) {
for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA);
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum);
out.println(Utils.join("\t", vals));
}
}
@ -176,9 +180,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* @param fields a non-null list of fields to capture from VC
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
* provides a value of NA
* @param kma if true, multiallelic variants are to be kept
* @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count.
* @return
*/
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
private static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean kma, boolean logsum) {
List<String> vals = new ArrayList<String>();
for ( String field : fields ) {
@ -206,12 +212,31 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
if (field.equals("AF") || field.equals("AC")) {
if (val.contains(",")) {
// strip [,] and spaces
val = val.replace("[","");
val = val.replace("]","");
val = val.replace(" ","");
}
String afo = val;
double af=0;
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
double[] afd = new double[afs.length];
for (int k=0; k < afd.length; k++)
afd[k] = Double.valueOf(afs[k]);
if (kma && logsum)
af = MathUtils.sum(afd);
else
af = MathUtils.arrayMax(afd);
//af = Double.valueOf(afs[0]);
}
else
if (!afo.equals("NA"))
af = Double.valueOf(afo);
val = Double.toString(af);
}
vals.add(val);
@ -220,6 +245,9 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return vals;
}
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
return extractFields(vc, fields, allowMissingData, false, false);
}
//
// default reduce -- doesn't do anything at all
//

View File

@ -77,6 +77,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testConcordance--" + testFile, spec);
}
@Test
public void testVariantTypeSelection() {
String testFile = validationDataLocation + "complexExample1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s -NO_HEADER",
1,
Arrays.asList("e0b12c0b47a8a7a988b3587b47bfa8cf")
);
executeTest("testVariantTypeSelection--" + testFile, spec);
}
@Test(enabled=false)
public void testRemovePLs() {
String testFile = validationDataLocation + "combine.3.vcf";