diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java index fc7670608..cf08e9b58 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java @@ -33,11 +33,13 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.Map.Entry; import org.broad.tribble.vcf.VCFHeader; import org.broad.tribble.vcf.VCFHeaderLine; @@ -81,27 +83,19 @@ public class GenomicAnnotator extends RodWalker { @Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B mydbsnp,AnnotatorInputTable,/path/to/mydbsnp.txt -B mytable,AnnotatorInputTable,/path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false) protected String[] SELECT_COLUMNS = {}; + @Argument(fullName="join", shortName="J", doc="Optionally specifies a file and column within that file that should be LEFT-JOIN'ed to a column in a previously-specified file. The file provided to -J must be tab-delimited, with the first non-comment/non-empty line containing column names. (example: -B name,AnnotatorInputTable,/path/to/file1 -J name2,/path/to/file2,name.columnName=name2.columnName2 - this will join the table in file2 to the table in file1) ", required=false) + protected String[] JOIN_ARGS = {}; + @Argument(fullName="oneToMany", shortName="m", doc="If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false) protected Boolean ONE_TO_MANY = false; - @Argument(fullName="join", shortName="j", doc="TODO If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false) - protected String[] JOIN_COLUMNS = {}; - - /* - NOTE: there are several cases for file1.a=file2.b - if, for a particular locus: - only file1.0 matches, INNER-JOIN - skip file1.a OUTER JOIN - annotate with file1.a - only file2.0 matches, INNER-JOIN - skip file2.a OUTER JOIN - annotate with file2.a - - */ - private VCFWriter vcfWriter; - private HashMap nonVCFsampleName = new HashMap(); - private VariantAnnotatorEngine engine; + private boolean strict = true; + /** * Prepare the output file and the list of available features. @@ -112,34 +106,111 @@ public class GenomicAnnotator extends RodWalker { TreeSet samples = new TreeSet(); SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); - // add the non-VCF sample from the command-line, if applicable - if ( sampleName != null ) { - nonVCFsampleName.put(sampleName.toUpperCase(), "variant"); - samples.add(sampleName.toUpperCase()); - } - // if there are no valid samples, warn the user - if ( samples.size() == 0 ) { - logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired."); - } //read all ROD file headers and construct a set of all column names to be used for validation of command-line args - final HashSet allColumnNames = new HashSet(); + final Set allFullyQualifiedColumnNames = new LinkedHashSet(); + final Set allBindingNames = new LinkedHashSet(); try { for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) { if(! ds.getReferenceOrderedData().getType().equals(AnnotatorInputTableCodec.class)) { continue; //skip all non-AnnotatorInputTable files. } + final String bindingName = ds.getName(); + allBindingNames.add(bindingName); final ArrayList header = AnnotatorInputTableCodec.readHeader(ds.getReferenceOrderedData().getFile()); for(String columnName : header) { - allColumnNames.add(ds.getName() + "." + columnName); + allFullyQualifiedColumnNames.add(bindingName + "." + columnName); } } } catch(IOException e) { throw new StingException("Failed when attempting to read file header. ", e); } + + //parse the JOIN_COLUMNS args, read in the specified files, and validate column names in the = relation. This end result of this loop is to populate the List of joinTables with one entry per -J arg. + final List joinTables = new LinkedList(); + for(String joinArg : JOIN_ARGS) { + + //parse the tokens + final String[] arg = joinArg.split(","); + if(arg.length != 3) { + throw new StingException("The following -J arg: \"" + joinArg + "\" must contain 3 comma-separated values. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); + } + final String bindingName = arg[0]; + final String filename = arg[1]; + final String columnsToJoin = arg[2]; + + if(allBindingNames.contains(bindingName)) { + throw new StingException("The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used."); + } + + + String[] splitOnEquals = columnsToJoin.split("=+"); + if(splitOnEquals.length != 2) { + throw new StingException("The -J arg: \"" + joinArg + "\" must specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); + } + + String[] splitOnDot1 = splitOnEquals[0].split("\\."); + String[] splitOnDot2 = splitOnEquals[1].split("\\."); + if(splitOnDot1.length != 2 || splitOnDot2.length != 2) { + throw new StingException("The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); + } + + + final String bindingName1 = splitOnDot1[0]; + final String columnName1 = splitOnDot1[1]; + final String bindingName2 = splitOnDot2[0]; + final String columnName2 = splitOnDot2[1]; + + //figure out which of the 2 binding names within the = relation matches the -J bindingName + final String localBindingName = bindingName; //alias + final String localColumnName; + final String externalBindingName; + final String externalColumnName; + if(bindingName1.equals(bindingName)) { + localColumnName = columnName1; + externalBindingName = bindingName2; + externalColumnName = columnName2; + } else if(bindingName2.equals(bindingName)) { + localColumnName = columnName2; + externalBindingName = bindingName1; + externalColumnName = columnName1; + } else { + throw new StingException("The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); + } + + //validate externalColumnName + final String fullyQualifiedExternalColumnName = externalBindingName + '.' + externalColumnName; + if( !allFullyQualifiedColumnNames.contains(fullyQualifiedExternalColumnName) ) { + throw new StingException("The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + fullyQualifiedExternalColumnName + "\""); + } + + //read in the file contents into a JoinTable object + final JoinTable joinTable = new JoinTable(); + joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName, strict); + joinTables.add(joinTable); + + //validate localColumnName, and add all column names in this file to the list of allFullyQualifiedColumnNames so that they can be referenced from subsequent -J args. + final List columnNames = joinTable.getColumnNames(); + final List fullyQualifiedColumnNames = new LinkedList(); + boolean found = false; + for(int i = 0; i < columnNames.size(); i++) { + final String columnName = columnNames.get(i); + if(columnName.equals(localColumnName)) { + found = true; + } + fullyQualifiedColumnNames.add(localBindingName + '.' + columnName); + } + + if(!found) { + throw new StingException("The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename); + } + + allFullyQualifiedColumnNames.addAll(fullyQualifiedColumnNames); + } + //parse the SELECT_COLUMNS arg and validate the column names List parsedSelectColumns = new LinkedList(); for(String token : SELECT_COLUMNS) { @@ -148,17 +219,16 @@ public class GenomicAnnotator extends RodWalker { SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS); for(String columnName : SELECT_COLUMNS) { - if(!allColumnNames.contains(columnName)) { - throw new StingException("The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. Here is the list of available column names: " + allColumnNames); + if(!allFullyQualifiedColumnNames.contains(columnName)) { + throw new StingException("The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. Here is the list of available column names: " + allFullyQualifiedColumnNames); } } - - - //instanciate teh VariantAnnotatorEngine + //instanciate the VariantAnnotatorEngine engine = new VariantAnnotatorEngine(getToolkit(), new String[] { }, new String[] { "GenomicAnnotation" }); engine.setOneToMany( Boolean.TRUE.equals( ONE_TO_MANY ) ); engine.setRequestedColumns(SELECT_COLUMNS); + engine.setJoinTables(joinTables); // setup the header fields Set hInfo = new HashSet(); @@ -224,13 +294,14 @@ public class GenomicAnnotator extends RodWalker { if ( variant instanceof VCFRecord) { //TODO is this if-statement necessary? for(VariantContext annotatedVC : annotatedVCs ) { - vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase())); + vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase())); } } - return 1; + return annotatedVCs.size(); } + /** * Increment the number of loci processed. * @@ -247,10 +318,21 @@ public class GenomicAnnotator extends RodWalker { * * @param result the number of loci seen. */ - public void onTraversalDone(Integer result) { - out.printf("Processed %d loci.\n", result); + public void onTraversalDone(Integer totalOutputVCFRecords) { + out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords); + Map inputTableHitCounter = engine.getInputTableHitCounter(); + for(Entry e : inputTableHitCounter.entrySet()) { + final String bindingName = e.getKey(); + final int counter = e.getValue(); + final float percent = 100 * counter /(float) totalOutputVCFRecords; + out.printf(" %-6.1f%% (%d) annotated with %s.\n", percent, counter, bindingName ); + } + vcfWriter.close(); } + + + }