From 38969b97836e593a8879e4790940dd48a13e64b4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 26 Jul 2011 11:09:06 -0400 Subject: [PATCH] Prototype of RODBinding @Arguments instead of -B syntax Initial version of RodBinding class. Flow from walker Rodbinding @Arguments -> RMDTriplet (old system) -> GATK engine (standard). Will need refactoring. --- .../commandline/ArgumentTypeDescriptor.java | 38 ++- .../sting/commandline/ParsingEngine.java | 1 + .../sting/commandline/RodBinding.java | 63 +++++ .../sting/gatk/CommandLineExecutable.java | 25 +- .../VariantsToTableNewRodStyle.java | 225 ++++++++++++++++++ .../sting/utils/text/ListFileUtils.java | 39 +++ 6 files changed, 386 insertions(+), 5 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/commandline/RodBinding.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableNewRodStyle.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 9c33e084d..eaabe4da2 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import java.io.File; import java.lang.annotation.Annotation; import java.lang.reflect.*; import java.util.*; @@ -275,6 +276,35 @@ public abstract class ArgumentTypeDescriptor { } } +/** + * Parser for RodBinding objects + */ +class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want RodBinding class objects + * @param type The type to check. + * @return true if the provided class is a RodBinding.class + */ + @Override + public boolean supports( Class type ) { + return isRodBinding(type); + } + + public static boolean isRodBinding( Class type ) { + return type.isAssignableFrom(RodBinding.class); + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Class type, ArgumentMatches matches) { + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + String value = getArgumentValue( defaultDefinition, matches ); + RodBinding result = new RodBinding(source.field.getName(), new File(value)); + Tags tags = getArgumentTags(matches); + parsingEngine.addTags(result,tags); + return result; + } +} + /** * Parse simple argument types: java primitives, wrapper classes, and anything that has * a simple String constructor. @@ -282,9 +312,10 @@ public abstract class ArgumentTypeDescriptor { class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public boolean supports( Class type ) { - if( type.isPrimitive() ) return true; - if( type.isEnum() ) return true; - if( primitiveToWrapperMap.containsValue(type) ) return true; + if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) ) return false; + if ( type.isPrimitive() ) return true; + if ( type.isEnum() ) return true; + if ( primitiveToWrapperMap.containsValue(type) ) return true; try { type.getConstructor(String.class); @@ -385,7 +416,6 @@ class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Class type, ArgumentMatches matches) { Class componentType; Object result; - Tags tags; if( Collection.class.isAssignableFrom(type) ) { diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java index 0dc18e6f9..e2e694cfb 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java @@ -64,6 +64,7 @@ public class ParsingEngine { * The type of set used must be ordered (but not necessarily sorted). */ private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(), + new RodBindingArgumentTypeDescriptor(), new CompoundArgumentTypeDescriptor(), new MultiplexArgumentTypeDescriptor()) ); diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java new file mode 100644 index 000000000..2f5046b3a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.util.List; + +/** + * + */ +// TODO -- should have a derived class called VariantContentRodBinding with simple accessors +public class RodBinding { + final String variableName; + final File sourceFile; + + public RodBinding(final String variableName, final File sourceFile) { + this.variableName = variableName; + this.sourceFile = sourceFile; + } + + public String getVariableName() { + return variableName; + } + + public File getSourceFile() { + return sourceFile; + } + + public List getAll(RefMetaDataTracker tracker) { + return (List)tracker.getReferenceMetaData(variableName); + } + + public T getVariantContext(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc) { + return (T)tracker.getVariantContext(ref, variableName, loc); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index a080ab439..10573cf25 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -25,8 +25,10 @@ package org.broadinstitute.sting.gatk; +import org.broadinstitute.sting.commandline.ArgumentSource; import org.broadinstitute.sting.commandline.ArgumentTypeDescriptor; import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; @@ -34,12 +36,15 @@ import org.broadinstitute.sting.gatk.io.stubs.SAMFileReaderArgumentTypeDescripto import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.text.ListFileUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.List; /** * @author aaron @@ -81,7 +86,6 @@ public abstract class CommandLineExecutable extends CommandLineProgram { // File lists can require a bit of additional expansion. Set these explicitly by the engine. engine.setSAMFileIDs(ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser)); - engine.setReferenceMetaDataFiles(ListFileUtils.unpackRODBindings(getArgumentCollection().RODBindings,getArgumentCollection().DBSNPFile,parser)); engine.setWalker(walker); walker.setToolkit(engine); @@ -96,6 +100,11 @@ public abstract class CommandLineExecutable extends CommandLineProgram { loadArgumentsIntoObject(walker); argumentSources.add(walker); + Collection newStyle = ListFileUtils.unpackRODBindings(getRodBindingsInWalker(walker), parser); + Collection oldStyle = ListFileUtils.unpackRODBindings(getArgumentCollection().RODBindings, getArgumentCollection().DBSNPFile, parser); + oldStyle.addAll(newStyle); + engine.setReferenceMetaDataFiles(oldStyle); + for (ReadFilter filter: filters) { loadArgumentsIntoObject(filter); argumentSources.add(filter); @@ -112,6 +121,20 @@ public abstract class CommandLineExecutable extends CommandLineProgram { return 0; } + private List getRodBindingsInWalker(Walker walker) { + List rods = new ArrayList(); + + for ( ArgumentSource source : parser.extractArgumentSources(walker.getClass()) ) { + Object obj = JVMUtils.getFieldValue(source.field, walker); + if ( obj instanceof RodBinding ) { + System.out.printf("Found rod binding for field %s of %s%n", obj, source.field); + rods.add((RodBinding)obj); + } + } + + return rods; + } + /** * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. * This report will be written to either STDOUT or to the run repository, depending on the options diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableNewRodStyle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableNewRodStyle.java new file mode 100644 index 000000000..b44983e3d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableNewRodStyle.java @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.io.PrintStream; +import java.util.*; + +/** + * Emits specific fields as dictated by the user from one or more VCF files. + */ +public class VariantsToTableNewRodStyle extends RodWalker { + @Output(doc="File to which results should be written",required=true) + protected PrintStream out; + + @Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true) + public ArrayList fieldsToTake = new ArrayList(); + + @Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records") + public boolean showFiltered = false; + + @Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false) + public int MAX_RECORDS = -1; + int nRecords = 0; + + @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) + public boolean keepMultiAllelic = false; + + @Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false) + public boolean ALLOW_MISSING_DATA = false; + + @Input(fullName="variants", shortName="V", doc="The variant file we will convert to a table", required=true) + public RodBinding variants; + + public void initialize() { + out.println(Utils.join("\t", fieldsToTake)); + } + + public static abstract class Getter { public abstract String get(VariantContext vc); } + public static Map getters = new HashMap(); + + static { + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT + getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); + getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); + getters.put("REF", new Getter() { + public String get(VariantContext vc) { + String x = ""; + if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) { + Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)); + x=x+new String(new byte[]{refByte}); + } + return x+vc.getReference().getDisplayString(); + } + }); + getters.put("ALT", new Getter() { + public String get(VariantContext vc) { + StringBuilder x = new StringBuilder(); + int n = vc.getAlternateAlleles().size(); + if ( n == 0 ) return "."; + if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) { + Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)); + x.append(new String(new byte[]{refByte})); + } + + for ( int i = 0; i < n; i++ ) { + if ( i != 0 ) x.append(","); + x.append(vc.getAlternateAllele(i).getDisplayString()); + } + return x.toString(); + } + }); + getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); + getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { + if ( vc.isSNP() && vc.isBiallelic() ) + return VariantContextUtils.isTransition(vc) ? "1" : "0"; + else + return "-1"; + }}); + getters.put("FILTER", new Getter() { public String get(VariantContext vc) { + return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); } + }); + + getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); + getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); + getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); + getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); + getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); + getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); + getters.put("GQ", new Getter() { public String get(VariantContext vc) { + if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); + return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); + }}); + } + + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return 0; + + if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) { + VariantContext vc = variants.getVariantContext(tracker, ref, context.getLocation()); + if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) { + List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA); + out.println(Utils.join("\t", vals)); + } + return 1; + } else { + if ( nRecords >= MAX_RECORDS ) { + logger.warn("Calling sys exit to leave after " + nRecords + " records"); + System.exit(0); // todo -- what's the recommend way to abort like this? + } + return 0; + } + } + + private static final boolean isWildCard(String s) { + return s.endsWith("*"); + } + + public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { + List vals = new ArrayList(); + + for ( String field : fields ) { + String val = "NA"; + + if ( getters.containsKey(field) ) { + val = getters.get(field).get(vc); + } else if ( vc.hasAttribute(field) ) { + val = vc.getAttributeAsString(field); + } else if ( isWildCard(field) ) { + Set wildVals = new HashSet(); + for ( Map.Entry elt : vc.getAttributes().entrySet()) { + if ( elt.getKey().startsWith(field.substring(0, field.length() - 1)) ) { + wildVals.add(elt.getValue().toString()); + } + } + + if ( wildVals.size() > 0 ) { + List toVal = new ArrayList(wildVals); + Collections.sort(toVal); + val = Utils.join(",", toVal); + } + } else if ( ! allowMissingData ) { + throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc)); + } + + if (field.equals("AF") || field.equals("AC")) { + String afo = val; + + double af=0; + if (afo.contains(",")) { + String[] afs = afo.split(","); + afs[0] = afs[0].substring(1,afs[0].length()); + afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); + + double[] afd = new double[afs.length]; + + for (int k=0; k < afd.length; k++) + afd[k] = Double.valueOf(afs[k]); + + af = MathUtils.arrayMax(afd); + //af = Double.valueOf(afs[0]); + + } + else + if (!afo.equals("NA")) + af = Double.valueOf(afo); + + val = Double.toString(af); + + } + vals.add(val); + } + + return vals; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } + + public void onTraversalDone(Integer sum) {} +} diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java index 4ab1c1685..d9bf86aba 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.text; import org.broadinstitute.sting.commandline.ParsingEngine; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper; @@ -93,6 +94,7 @@ public class ListFileUtils { * @return a list of expanded, bound RODs. */ public static Collection unpackRODBindings(final List RODBindings, final String dbSNPFile, final ParsingEngine parser) { + // todo -- this is a strange home for this code. Move into ROD system Collection rodBindings = new ArrayList(); for (String fileName: RODBindings) { @@ -134,6 +136,43 @@ public class ListFileUtils { return rodBindings; } + /** + * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. + * @param RODBindings a text equivale + * @return a list of expanded, bound RODs. + */ + public static Collection unpackRODBindings(final List RODBindings, final ParsingEngine parser) { + // todo -- this is a strange home for this code. Move into ROD system + Collection rodBindings = new ArrayList(); + + for (RodBinding rodBinding: RODBindings) { + String argValue = rodBinding.getSourceFile().getPath(); + String fileName = expandFileName(argValue); + final Tags tags = parser.getTags(rodBinding); + + List positionalTags = tags.getPositionalTags(); + if(positionalTags.size() != 1) + throw new UserException("Invalid syntax for RODBinding (reference-ordered data) input . " + + "Please use the following syntax when providing reference-ordered " + + "data: -: ."); + // Assume that if tags are present, those tags are name and type. + // Name is always first, followed by type. + String name = rodBinding.getVariableName(); + String type = positionalTags.get(0); + + RMDTriplet.RMDStorageType storageType = null; + if(tags.getValue("storage") != null) + storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage")); + else if(fileName.toLowerCase().endsWith("stdin")) + storageType = RMDTriplet.RMDStorageType.STREAM; + else + storageType = RMDTriplet.RMDStorageType.FILE; + + rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags)); + } + + return rodBindings; + } /** * Expand any special characters that appear in the filename. Right now, '-' is expanded to