Merge branch 'master' of ssh://copper.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
98acb546a9
|
|
@ -69,7 +69,7 @@ import java.util.*;
|
||||||
* <p>
|
* <p>
|
||||||
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
|
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
|
||||||
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
|
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
|
||||||
* or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
|
* or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
|
||||||
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
|
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
|
||||||
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
|
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
|
||||||
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,
|
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,
|
||||||
|
|
|
||||||
|
|
@ -40,36 +40,211 @@ import java.io.PrintStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Emits specific fields as dictated by the user from one or more VCF files.
|
* Emits specific fields from a VCF file to a tab-deliminated table
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This walker accepts a single VCF file and writes out user-selected fields from the
|
||||||
|
* VCF as a header-containing, tab-deliminated file. The user specifies one or more
|
||||||
|
* fields to print with the -F NAME, each of which appears as a single column in
|
||||||
|
* the output file, with a header named NAME, and the value of this field in the VCF
|
||||||
|
* one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding
|
||||||
|
* in the INFO field (AC=10). Note that this tool does not support capturing any
|
||||||
|
* GENOTYPE field values. If a VCF record is missing a value, then the tool by
|
||||||
|
* default throws an error, but the special value NA can be emitted instead with
|
||||||
|
* appropriate tool arguments.
|
||||||
|
*
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Input</h2>
|
||||||
|
* <p>
|
||||||
|
* <ul>
|
||||||
|
* <li>A VCF file</li>
|
||||||
|
* <li>A list of -F fields to write</li>
|
||||||
|
* </ul>
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Output</h2>
|
||||||
|
* <p>
|
||||||
|
* A table deliminated file containing the values of the requested fields in the VCF file
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Examples</h2>
|
||||||
|
* <pre>
|
||||||
|
* -T $WalkerName \
|
||||||
|
* -V file.vcf \
|
||||||
|
* -F CHROM -F POS -F ID -F QUAL -F AC \
|
||||||
|
* -o results.table
|
||||||
|
*
|
||||||
|
* would produce a file that looks like:
|
||||||
|
*
|
||||||
|
* CHROM POS ID QUAL AC
|
||||||
|
* 1 10 . 50 1
|
||||||
|
* 1 20 rs10 99 10
|
||||||
|
* et cetera...
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* @author Mark DePristo
|
||||||
|
* @since 2010
|
||||||
*/
|
*/
|
||||||
public class VariantsToTable extends RodWalker<Integer, Integer> {
|
public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
@ArgumentCollection
|
@ArgumentCollection
|
||||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||||
|
|
||||||
@Output(doc="File to which results should be written",required=true)
|
@Output(doc="File to which results should be written",required=true)
|
||||||
protected PrintStream out;
|
protected PrintStream out;
|
||||||
|
|
||||||
@Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true)
|
/**
|
||||||
public ArrayList<String> fieldsToTake = new ArrayList<String>();
|
* -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10).
|
||||||
|
* Note that this tool does not support capturing any GENOTYPE field values. Note this argument
|
||||||
|
* accepts any number of inputs. So -F CHROM -F POS is allowed.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
|
||||||
|
public List<String> fieldsToTake = new ArrayList<String>();
|
||||||
|
|
||||||
@Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records")
|
/**
|
||||||
|
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
|
||||||
|
* Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
|
||||||
public boolean showFiltered = false;
|
public boolean showFiltered = false;
|
||||||
|
|
||||||
@Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false)
|
/**
|
||||||
|
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
|
||||||
public int MAX_RECORDS = -1;
|
public int MAX_RECORDS = -1;
|
||||||
int nRecords = 0;
|
int nRecords = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then
|
||||||
|
* VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this
|
||||||
|
* can make your resulting file unreadable and malformated according to tools like R, as the representation of
|
||||||
|
* multi-allelic INFO field values can be lists of values.
|
||||||
|
*/
|
||||||
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
|
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
|
||||||
public boolean keepMultiAllelic = false;
|
public boolean keepMultiAllelic = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
|
||||||
|
* is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being
|
||||||
|
* found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such
|
||||||
|
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
|
||||||
|
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
|
||||||
|
*/
|
||||||
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
|
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
|
||||||
public boolean ALLOW_MISSING_DATA = false;
|
public boolean ALLOW_MISSING_DATA = false;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
// print out the header
|
||||||
out.println(Utils.join("\t", fieldsToTake));
|
out.println(Utils.join("\t", fieldsToTake));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) {
|
||||||
|
for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
|
||||||
|
if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
|
||||||
|
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA);
|
||||||
|
out.println(Utils.join("\t", vals));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
if ( nRecords >= MAX_RECORDS ) {
|
||||||
|
logger.warn("Calling sys exit to leave after " + nRecords + " records");
|
||||||
|
System.exit(0); // todo -- what's the recommend way to abort like this?
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final boolean isWildCard(String s) {
|
||||||
|
return s.endsWith("*");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility function that returns the list of values for each field in fields from vc.
|
||||||
|
*
|
||||||
|
* @param vc the VariantContext whose field values we can to capture
|
||||||
|
* @param fields a non-null list of fields to capture from VC
|
||||||
|
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
|
||||||
|
* provides a value of NA
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||||
|
List<String> vals = new ArrayList<String>();
|
||||||
|
|
||||||
|
for ( String field : fields ) {
|
||||||
|
String val = "NA";
|
||||||
|
|
||||||
|
if ( getters.containsKey(field) ) {
|
||||||
|
val = getters.get(field).get(vc);
|
||||||
|
} else if ( vc.hasAttribute(field) ) {
|
||||||
|
val = vc.getAttributeAsString(field);
|
||||||
|
} else if ( isWildCard(field) ) {
|
||||||
|
Set<String> wildVals = new HashSet<String>();
|
||||||
|
for ( Map.Entry<String,Object> elt : vc.getAttributes().entrySet()) {
|
||||||
|
if ( elt.getKey().startsWith(field.substring(0, field.length() - 1)) ) {
|
||||||
|
wildVals.add(elt.getValue().toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( wildVals.size() > 0 ) {
|
||||||
|
List<String> toVal = new ArrayList<String>(wildVals);
|
||||||
|
Collections.sort(toVal);
|
||||||
|
val = Utils.join(",", toVal);
|
||||||
|
}
|
||||||
|
} else if ( ! allowMissingData ) {
|
||||||
|
throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (field.equals("AF") || field.equals("AC")) {
|
||||||
|
String afo = val;
|
||||||
|
|
||||||
|
double af=0;
|
||||||
|
if (afo.contains(",")) {
|
||||||
|
String[] afs = afo.split(",");
|
||||||
|
afs[0] = afs[0].substring(1,afs[0].length());
|
||||||
|
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
||||||
|
|
||||||
|
double[] afd = new double[afs.length];
|
||||||
|
|
||||||
|
for (int k=0; k < afd.length; k++)
|
||||||
|
afd[k] = Double.valueOf(afs[k]);
|
||||||
|
|
||||||
|
af = MathUtils.arrayMax(afd);
|
||||||
|
//af = Double.valueOf(afs[0]);
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if (!afo.equals("NA"))
|
||||||
|
af = Double.valueOf(afo);
|
||||||
|
|
||||||
|
val = Double.toString(af);
|
||||||
|
|
||||||
|
}
|
||||||
|
vals.add(val);
|
||||||
|
}
|
||||||
|
|
||||||
|
return vals;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// default reduce -- doesn't do anything at all
|
||||||
|
//
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
|
||||||
|
public void onTraversalDone(Integer sum) {}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// static system for getting values from VC by name.
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
public static abstract class Getter { public abstract String get(VariantContext vc); }
|
public static abstract class Getter { public abstract String get(VariantContext vc); }
|
||||||
public static Map<String, Getter> getters = new HashMap<String, Getter>();
|
public static Map<String, Getter> getters = new HashMap<String, Getter>();
|
||||||
|
|
||||||
|
|
@ -128,98 +303,4 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
}});
|
}});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
|
||||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) {
|
|
||||||
for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
|
|
||||||
if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
|
|
||||||
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA);
|
|
||||||
out.println(Utils.join("\t", vals));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return 1;
|
|
||||||
} else {
|
|
||||||
if ( nRecords >= MAX_RECORDS ) {
|
|
||||||
logger.warn("Calling sys exit to leave after " + nRecords + " records");
|
|
||||||
System.exit(0); // todo -- what's the recommend way to abort like this?
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final boolean isWildCard(String s) {
|
|
||||||
return s.endsWith("*");
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
|
||||||
List<String> vals = new ArrayList<String>();
|
|
||||||
|
|
||||||
for ( String field : fields ) {
|
|
||||||
String val = "NA";
|
|
||||||
|
|
||||||
if ( getters.containsKey(field) ) {
|
|
||||||
val = getters.get(field).get(vc);
|
|
||||||
} else if ( vc.hasAttribute(field) ) {
|
|
||||||
val = vc.getAttributeAsString(field);
|
|
||||||
} else if ( isWildCard(field) ) {
|
|
||||||
Set<String> wildVals = new HashSet<String>();
|
|
||||||
for ( Map.Entry<String,Object> elt : vc.getAttributes().entrySet()) {
|
|
||||||
if ( elt.getKey().startsWith(field.substring(0, field.length() - 1)) ) {
|
|
||||||
wildVals.add(elt.getValue().toString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( wildVals.size() > 0 ) {
|
|
||||||
List<String> toVal = new ArrayList<String>(wildVals);
|
|
||||||
Collections.sort(toVal);
|
|
||||||
val = Utils.join(",", toVal);
|
|
||||||
}
|
|
||||||
} else if ( ! allowMissingData ) {
|
|
||||||
throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (field.equals("AF") || field.equals("AC")) {
|
|
||||||
String afo = val;
|
|
||||||
|
|
||||||
double af=0;
|
|
||||||
if (afo.contains(",")) {
|
|
||||||
String[] afs = afo.split(",");
|
|
||||||
afs[0] = afs[0].substring(1,afs[0].length());
|
|
||||||
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
|
||||||
|
|
||||||
double[] afd = new double[afs.length];
|
|
||||||
|
|
||||||
for (int k=0; k < afd.length; k++)
|
|
||||||
afd[k] = Double.valueOf(afs[k]);
|
|
||||||
|
|
||||||
af = MathUtils.arrayMax(afd);
|
|
||||||
//af = Double.valueOf(afs[0]);
|
|
||||||
|
|
||||||
}
|
|
||||||
else
|
|
||||||
if (!afo.equals("NA"))
|
|
||||||
af = Double.valueOf(afo);
|
|
||||||
|
|
||||||
val = Double.toString(af);
|
|
||||||
|
|
||||||
}
|
|
||||||
vals.add(val);
|
|
||||||
}
|
|
||||||
|
|
||||||
return vals;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduceInit() {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduce(Integer counter, Integer sum) {
|
|
||||||
return counter + sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void onTraversalDone(Integer sum) {}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue