diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index b5f0a8001..9d2e493a0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -314,23 +314,23 @@ public class VariantAnnotatorEngine { final String fullyQualifiedExternalColumnName = GenomicAnnotation.generateInfoFieldKey(externalBindingName, externalColumnName); //find the externalJoinColumnValue in the current info field, and then look up any joinTable records that have this value for the localJoinColumnValue - List> matchingJoinTableRecords = null; //record(s) in the join table whose joinColumnValue(s) matches the joinColumnValue inside the current outputRecordInfoField. Since the join keys don't have to be unique, there may be more than one record in the join table thtat matches. + ArrayList matchingJoinTableRecord = null; //record in the join table whose joinColumnValue matches the joinColumnValue inside the current outputRecordInfoField. final Object numInfoFieldKeysToCheckObj = outputRecordInfoField.get(GenomicAnnotation.generateInfoFieldKey(externalBindingName, GenomicAnnotation.NUM_MATCHES_SPECIAL_INFO_FIELD)); if(numInfoFieldKeysToCheckObj == null) { - //only 1 record in the externalBindingName -B AnnotatoInfoTable overlapped the current position + //only 1 record in the externalBindingName -B AnnotationInfoTable overlapped the current position Object externalColumnValue = outputRecordInfoField.get(fullyQualifiedExternalColumnName); if(externalColumnValue != null) { - matchingJoinTableRecords = joinTable.get(externalColumnValue.toString()); - //System.err.println("Found matching record(s) in join table for record: " + outputRecordInfoField + " where " + fullyQualifiedExternalColumnName + "==" + externalColumnValue + ": " + matchingJoinTableRecords); + matchingJoinTableRecord = joinTable.get(externalColumnValue.toString()); + //System.err.println("Found matching record in join table for record: " + outputRecordInfoField + " where " + fullyQualifiedExternalColumnName + "==" + externalColumnValue + ": " + matchingJoinTableRecords); } } else { - //multiple records in the externalBindingName -B AnnotatoInfoTable overlapped the current position + //multiple records in the externalBindingName -B AnnotationInfoTable overlapped the current position final int numInfoFieldKeysToCheck = Integer.parseInt(numInfoFieldKeysToCheckObj.toString()); - for(int i = 0; i < numInfoFieldKeysToCheck; i++) { + for (int i = 0; i < numInfoFieldKeysToCheck; i++) { final Object externalColumnValue = outputRecordInfoField.get(fullyQualifiedExternalColumnName + "_" + i); - if(externalColumnValue != null) { - matchingJoinTableRecords = joinTable.get(externalColumnValue.toString()); - if(matchingJoinTableRecords != null) { + if ( externalColumnValue != null ) { + matchingJoinTableRecord = joinTable.get(externalColumnValue.toString()); + if ( matchingJoinTableRecord != null ) { //System.err.println("Found matching record(s) in join table for record: " + outputRecordInfoField + " where " + fullyQualifiedExternalColumnName + "==" + externalColumnValue + ": " + matchingJoinTableRecords); break; } @@ -338,25 +338,20 @@ public class VariantAnnotatorEngine { } } - //if match(s) for the externalJoinColumnValue in the current outputRecordInfoField have been found in the join table, perform the join. - if(matchingJoinTableRecords != null) + //if a match for the externalJoinColumnValue in the current outputRecordInfoField has been found in the join table, perform the join. + if ( matchingJoinTableRecord != null ) { final String joinTableBindingName = joinTable.getLocalBindingName(); //convert the List> to List> by hashing the values from the ArrayList by their column names. final List> matchingJoinTableRecordsConverted = new LinkedList>(); - for(ArrayList columnValues : matchingJoinTableRecords) { - final List columnNames = joinTable.getColumnNames(); - - final Map matchingRecord = new LinkedHashMap(); - for(int i = 0; i < columnNames.size(); i++) { - matchingRecord.put(columnNames.get(i), columnValues.get(i)); - } - - matchingJoinTableRecordsConverted.add(GenomicAnnotation.convertRecordToAnnotations(joinTableBindingName, matchingRecord)); - } + final List columnNames = joinTable.getColumnNames(); + final Map matchingRecord = new LinkedHashMap(); + for (int i = 0; i < columnNames.size(); i++) + matchingRecord.put(columnNames.get(i), matchingJoinTableRecord.get(i)); + matchingJoinTableRecordsConverted.add(GenomicAnnotation.convertRecordToAnnotations(joinTableBindingName, matchingRecord)); // do the join between the outputRecordInfoField and the matchingJoinTableRecords, then add the results to to infoAnnotationOutputsList List> tempList = new LinkedList>(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java index dd40c5622..b5e6d7005 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotator.java @@ -46,7 +46,6 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.vcf.VCFUtils; /** @@ -68,7 +67,7 @@ public class GenomicAnnotator extends RodWalker implements Tre @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) protected String sampleName = null; - @Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B mydbsnp,AnnotatorInputTable,/path/to/mydbsnp.txt -B mytable,AnnotatorInputTable,/path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false) + @Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B:mydbsnp,AnnotatorInputTable /path/to/mydbsnp.txt -B:mytable,AnnotatorInputTable /path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false) protected String[] SELECT_COLUMNS = {}; @Argument(fullName="join", shortName="J", doc="Optionally specifies a file and column within that file that should be LEFT-JOIN'ed to a column in a previously-specified file. The file provided to -J must be tab-delimited, with the first non-comment/non-empty line containing column names. (example: -B name,AnnotatorInputTable,/path/to/file1 -J name2,/path/to/file2,name.columnName=name2.columnName2 - this will join the table in file2 to the table in file1) ", required=false) @@ -79,18 +78,11 @@ public class GenomicAnnotator extends RodWalker implements Tre private VariantAnnotatorEngine engine; - private boolean strict = true; - /** * Prepare the output file and the list of available features. */ public void initialize() { - // get the list of all sample names from the various VCF input rods - TreeSet samples = new TreeSet(); - SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); - - //read all ROD file headers and construct a set of all column names to be used for validation of command-line args final Set allFullyQualifiedColumnNames = new LinkedHashSet(); final Set allBindingNames = new LinkedHashSet(); @@ -110,7 +102,6 @@ public class GenomicAnnotator extends RodWalker implements Tre throw new StingException("Failed when attempting to read file header. ", e); } - //parse the JOIN_COLUMNS args, read in the specified files, and validate column names in the = relation. This end result of this loop is to populate the List of joinTables with one entry per -J arg. final List joinTables = new LinkedList(); for(String joinArg : JOIN_ARGS) { @@ -125,10 +116,9 @@ public class GenomicAnnotator extends RodWalker implements Tre final String columnsToJoin = arg[2]; if(allBindingNames.contains(bindingName)) { - throw new StingException("The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used."); + throw new StingException("The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used in another binding."); } - String[] splitOnEquals = columnsToJoin.split("=+"); if(splitOnEquals.length != 2) { throw new StingException("The -J arg: \"" + joinArg + "\" must specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); @@ -140,7 +130,6 @@ public class GenomicAnnotator extends RodWalker implements Tre throw new StingException("The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); } - final String bindingName1 = splitOnDot1[0]; final String columnName1 = splitOnDot1[1]; final String bindingName2 = splitOnDot2[0]; @@ -160,7 +149,7 @@ public class GenomicAnnotator extends RodWalker implements Tre externalBindingName = bindingName1; externalColumnName = columnName1; } else { - throw new StingException("The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); + throw new StingException("The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" must be specified in one the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); } //validate externalColumnName @@ -171,56 +160,51 @@ public class GenomicAnnotator extends RodWalker implements Tre //read in the file contents into a JoinTable object final JoinTable joinTable = new JoinTable(); - joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName, strict); + joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName); joinTables.add(joinTable); //validate localColumnName, and add all column names in this file to the list of allFullyQualifiedColumnNames so that they can be referenced from subsequent -J args. final List columnNames = joinTable.getColumnNames(); final List fullyQualifiedColumnNames = new LinkedList(); boolean found = false; - for(int i = 0; i < columnNames.size(); i++) { - final String columnName = columnNames.get(i); - if(columnName.equals(localColumnName)) { - found = true; - } - fullyQualifiedColumnNames.add(localBindingName + '.' + columnName); + for ( String columnName : columnNames ) { + if ( columnName.equals(localColumnName) ) + found = true; + fullyQualifiedColumnNames.add(localBindingName + '.' + columnName); } - - if(!found) { + if ( !found ) throw new StingException("The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename); - } allFullyQualifiedColumnNames.addAll(fullyQualifiedColumnNames); } //parse the SELECT_COLUMNS arg and validate the column names List parsedSelectColumns = new LinkedList(); - for(String token : SELECT_COLUMNS) { + for ( String token : SELECT_COLUMNS ) parsedSelectColumns.addAll(Arrays.asList(token.split(","))); - } SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS); - for(String columnName : SELECT_COLUMNS) { - if(!allFullyQualifiedColumnNames.contains(columnName)) { + for ( String columnName : SELECT_COLUMNS ) { + if ( !allFullyQualifiedColumnNames.contains(columnName) ) throw new StingException("The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. Here is the list of available column names: " + allFullyQualifiedColumnNames); - } } - //instanciate the VariantAnnotatorEngine + //instantiate the VariantAnnotatorEngine ArrayList annotationsToUse = new ArrayList(); annotationsToUse.add("GenomicAnnotation"); engine = new VariantAnnotatorEngine(getToolkit(), new ArrayList(), annotationsToUse); - engine.setOneToMany( Boolean.TRUE.equals( ONE_TO_MANY ) ); + engine.setOneToMany(ONE_TO_MANY); engine.setRequestedColumns(SELECT_COLUMNS); engine.setJoinTables(joinTables); - // setup the header fields + // set up the header fields Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList("variant"))); - hInfo.add(new VCFHeaderLine("source", "Annotator")); - hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName())); hInfo.addAll(engine.getVCFAnnotationDescriptions()); + Set rodName = new HashSet(); + rodName.add("variant"); + TreeSet samples = new TreeSet(SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName)); VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/JoinTable.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/JoinTable.java index 68c7d8ae8..ca841439d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/JoinTable.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/JoinTable.java @@ -30,7 +30,6 @@ import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; -import java.util.LinkedList; import java.util.List; import org.broadinstitute.sting.utils.StingException; @@ -62,7 +61,7 @@ public class JoinTable //stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName.columnName in the above example), //and the entry value is an ArrayList representing the entire join table record. - private HashMap>> joinColumnValueToRecords = new HashMap>>(); + private HashMap> joinColumnValueToRecords = new HashMap>(); private boolean parsedFromFile = false; @@ -73,9 +72,8 @@ public class JoinTable * @param localColumnName The column name within the given file to join on. * @param externalBindingName The bindingName of another file (previously specified with either -B or -J). * @param externalColumnName The columnName in this other file to join on. - * @param strict Whether to throw an exception if the number of columnNames in the header doesn't match the number of values in any row in the file specified by filename. */ - public void parseFromFile(String filename, String localBindingName, String localColumnName, String externalBindingName, String externalColumnName, boolean strict) { + public void parseFromFile(String filename, String localBindingName, String localColumnName, String externalBindingName, String externalColumnName) { if(parsedFromFile) { throw new StingException("parseFromFile(" + filename +", ..) called more than once"); } @@ -89,16 +87,15 @@ public class JoinTable try { br = new BufferedReader(new FileReader(filename)); - final JoinTableParser parser = new JoinTableParser(strict); + final JoinTableParser parser = new JoinTableParser(); //read in the header - final List header = parser.readHeader(br); - columnNames = header; + columnNames = parser.readHeader(br); //get the index of the localJoinColumnName int localColumnNameIdx = -1; - for(int i = 0; i < header.size(); i++) { - final String columnName = header.get(i); + for(int i = 0; i < columnNames.size(); i++) { + final String columnName = columnNames.get(i); if(columnName.equals(localColumnName)) { localColumnNameIdx = i; break; @@ -109,13 +106,14 @@ public class JoinTable throw new StingException("The -J arg specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename); } - //read in all records and create a map entry for each - String line = null; + String line; while((line = br.readLine()) != null) { final ArrayList columnValues = parser.parseLine(line); + if ( columnValues.size() < columnNames.size() ) + throw new IllegalStateException("The file: " + filename + " is malformed as there are not a sufficient number of columns for this line: " + line); final String joinColumnValue = columnValues.get(localColumnNameIdx); - put(joinColumnValue, columnValues); + put(joinColumnValue, columnValues, filename); } } catch(IOException e) @@ -134,8 +132,6 @@ public class JoinTable } } - - /** * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, * this returns bindingName1. @@ -149,7 +145,6 @@ public class JoinTable this.localBindingName = localBindingName; } - /** * @return the list of join table column names parsed out of the file header. */ @@ -175,7 +170,6 @@ public class JoinTable this.externalColumnName = externalColumnName; } - /** * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, * this returns bindingName2. @@ -192,7 +186,7 @@ public class JoinTable /** * Whether any join table records have the given value in the join column. - * @param value + * @param joinColumnValue value * @return */ public boolean containsJoinColumnValue(String joinColumnValue) { @@ -201,24 +195,21 @@ public class JoinTable /** * Returns all records in the table where the join column has the given value. - * @param joinColumnValue - * @return + * @param joinColumnValue column value + * @return row */ - public List> get(String joinColumnValue) { + public ArrayList get(String joinColumnValue) { return joinColumnValueToRecords.get(joinColumnValue); } /** * Adds the given record to the map. - * @param joinColumnValue - * @param record + * @param joinColumnValue value + * @param record row */ - protected void put(String joinColumnValue, ArrayList record) { - List> list = joinColumnValueToRecords.get(joinColumnValue); - if(list == null) { - list = new LinkedList>(); - joinColumnValueToRecords.put(joinColumnValue, list); - } - list.add(record); + protected void put(String joinColumnValue, ArrayList record, String filename) { + if ( joinColumnValueToRecords.containsKey(joinColumnValue) ) + throw new IllegalStateException("The file " + filename + " contains non-unique entries for the requested column, which isn't allowed."); + joinColumnValueToRecords.put(joinColumnValue, record); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/JoinTableParser.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/JoinTableParser.java index dbf7ec3dd..06f77262b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/JoinTableParser.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/annotator/JoinTableParser.java @@ -47,23 +47,15 @@ public class JoinTableParser private List header; //column names parsed out of the header line - /** Whether to throw an exception if a row contains a different number of columns than the header. */ - private boolean strict; - /** * Constructor. - * - * @param source The file to read. - * @param strict Whether to throw an exception if a row contains a different number of columns than the header. */ - public JoinTableParser(boolean strict) { - this.strict = strict; - } + public JoinTableParser() {} /** * Returns the header and returns it. - * @param source + * @param br source * @return */ public List readHeader(BufferedReader br) throws IOException @@ -95,7 +87,7 @@ public class JoinTableParser final ArrayList values = Utils.split(line, DELIMITER, header.size()); - if ( strict && values.size() != header.size() ) { + if ( values.size() != header.size() ) { throw new StingException(String.format("Encountered a row with %d columns which is different from the number or columns in the header: %d\nHeader: " + header + "\nLine: " + values, values.size(), header.size())); } @@ -103,12 +95,9 @@ public class JoinTableParser } - - - /** * Returns the header. - * @param source The file to read. + * @param br The file to read. * @return ArrayList containing column names from the header. * @throws IOException */ diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotatorIntegrationTest.java index 43bca529b..4891a3e1d 100755 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotatorIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/annotator/GenomicAnnotatorIntegrationTest.java @@ -26,7 +26,7 @@ public class GenomicAnnotatorIntegrationTest extends WalkerTest { */ - String[] md5WithDashSArg = {"5942c1bdc736f016af248994e036777a"}; + String[] md5WithDashSArg = {"9583d7060bc3de73b392e7435c31946b"}; WalkerTestSpec specWithSArg = new WalkerTestSpec( "-T GenomicAnnotator -R " + b36KGReference + " -B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" +