More cleanup of the Genomic Annotator. Also, we now require join tables to have unique entries for the column keyed on the join.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4124 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-08-26 04:43:52 +00:00
parent dd7f136298
commit 79cd716671
5 changed files with 59 additions and 100 deletions

View File

@ -314,23 +314,23 @@ public class VariantAnnotatorEngine {
final String fullyQualifiedExternalColumnName = GenomicAnnotation.generateInfoFieldKey(externalBindingName, externalColumnName);
//find the externalJoinColumnValue in the current info field, and then look up any joinTable records that have this value for the localJoinColumnValue
List<ArrayList<String>> matchingJoinTableRecords = null; //record(s) in the join table whose joinColumnValue(s) matches the joinColumnValue inside the current outputRecordInfoField. Since the join keys don't have to be unique, there may be more than one record in the join table thtat matches.
ArrayList<String> matchingJoinTableRecord = null; //record in the join table whose joinColumnValue matches the joinColumnValue inside the current outputRecordInfoField.
final Object numInfoFieldKeysToCheckObj = outputRecordInfoField.get(GenomicAnnotation.generateInfoFieldKey(externalBindingName, GenomicAnnotation.NUM_MATCHES_SPECIAL_INFO_FIELD));
if(numInfoFieldKeysToCheckObj == null) {
//only 1 record in the externalBindingName -B AnnotatoInfoTable overlapped the current position
//only 1 record in the externalBindingName -B AnnotationInfoTable overlapped the current position
Object externalColumnValue = outputRecordInfoField.get(fullyQualifiedExternalColumnName);
if(externalColumnValue != null) {
matchingJoinTableRecords = joinTable.get(externalColumnValue.toString());
//System.err.println("Found matching record(s) in join table for record: " + outputRecordInfoField + " where " + fullyQualifiedExternalColumnName + "==" + externalColumnValue + ": " + matchingJoinTableRecords);
matchingJoinTableRecord = joinTable.get(externalColumnValue.toString());
//System.err.println("Found matching record in join table for record: " + outputRecordInfoField + " where " + fullyQualifiedExternalColumnName + "==" + externalColumnValue + ": " + matchingJoinTableRecords);
}
} else {
//multiple records in the externalBindingName -B AnnotatoInfoTable overlapped the current position
//multiple records in the externalBindingName -B AnnotationInfoTable overlapped the current position
final int numInfoFieldKeysToCheck = Integer.parseInt(numInfoFieldKeysToCheckObj.toString());
for(int i = 0; i < numInfoFieldKeysToCheck; i++) {
for (int i = 0; i < numInfoFieldKeysToCheck; i++) {
final Object externalColumnValue = outputRecordInfoField.get(fullyQualifiedExternalColumnName + "_" + i);
if(externalColumnValue != null) {
matchingJoinTableRecords = joinTable.get(externalColumnValue.toString());
if(matchingJoinTableRecords != null) {
if ( externalColumnValue != null ) {
matchingJoinTableRecord = joinTable.get(externalColumnValue.toString());
if ( matchingJoinTableRecord != null ) {
//System.err.println("Found matching record(s) in join table for record: " + outputRecordInfoField + " where " + fullyQualifiedExternalColumnName + "==" + externalColumnValue + ": " + matchingJoinTableRecords);
break;
}
@ -338,25 +338,20 @@ public class VariantAnnotatorEngine {
}
}
//if match(s) for the externalJoinColumnValue in the current outputRecordInfoField have been found in the join table, perform the join.
if(matchingJoinTableRecords != null)
//if a match for the externalJoinColumnValue in the current outputRecordInfoField has been found in the join table, perform the join.
if ( matchingJoinTableRecord != null )
{
final String joinTableBindingName = joinTable.getLocalBindingName();
//convert the List<ArrayList<String>> to List<Map<String, String>> by hashing the values from the ArrayList<String> by their column names.
final List<Map<String, String>> matchingJoinTableRecordsConverted = new LinkedList<Map<String,String>>();
for(ArrayList<String> columnValues : matchingJoinTableRecords) {
final List<String> columnNames = joinTable.getColumnNames();
final Map<String, String> matchingRecord = new LinkedHashMap<String, String>();
for(int i = 0; i < columnNames.size(); i++) {
matchingRecord.put(columnNames.get(i), columnValues.get(i));
}
matchingJoinTableRecordsConverted.add(GenomicAnnotation.convertRecordToAnnotations(joinTableBindingName, matchingRecord));
}
final List<String> columnNames = joinTable.getColumnNames();
final Map<String, String> matchingRecord = new LinkedHashMap<String, String>();
for (int i = 0; i < columnNames.size(); i++)
matchingRecord.put(columnNames.get(i), matchingJoinTableRecord.get(i));
matchingJoinTableRecordsConverted.add(GenomicAnnotation.convertRecordToAnnotations(joinTableBindingName, matchingRecord));
// do the join between the outputRecordInfoField and the matchingJoinTableRecords, then add the results to infoAnnotationOutputsList
List<Map<String, Object>> tempList = new LinkedList<Map<String, Object>>();

View File

@ -46,7 +46,6 @@ import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
/**
@ -68,7 +67,7 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> implements Tre
@Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
protected String sampleName = null;
@Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B mydbsnp,AnnotatorInputTable,/path/to/mydbsnp.txt -B mytable,AnnotatorInputTable,/path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false)
@Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B:mydbsnp,AnnotatorInputTable /path/to/mydbsnp.txt -B:mytable,AnnotatorInputTable /path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false)
protected String[] SELECT_COLUMNS = {};
@Argument(fullName="join", shortName="J", doc="Optionally specifies a file and column within that file that should be LEFT-JOIN'ed to a column in a previously-specified file. The file provided to -J must be tab-delimited, with the first non-comment/non-empty line containing column names. (example: -B name,AnnotatorInputTable,/path/to/file1 -J name2,/path/to/file2,name.columnName=name2.columnName2 - this will join the table in file2 to the table in file1) ", required=false)
@ -79,18 +78,11 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> implements Tre
private VariantAnnotatorEngine engine;
private boolean strict = true;
/**
* Prepare the output file and the list of available features.
*/
public void initialize() {
// get the list of all sample names from the various VCF input rods
TreeSet<String> samples = new TreeSet<String>();
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
//read all ROD file headers and construct a set of all column names to be used for validation of command-line args
final Set<String> allFullyQualifiedColumnNames = new LinkedHashSet<String>();
final Set<String> allBindingNames = new LinkedHashSet<String>();
@ -110,7 +102,6 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> implements Tre
throw new StingException("Failed when attempting to read file header. ", e);
}
//parse the JOIN_ARGS, read in the specified files, and validate column names in the = relation. The end result of this loop is to populate the List of joinTables with one entry per -J arg.
final List<JoinTable> joinTables = new LinkedList<JoinTable>();
for(String joinArg : JOIN_ARGS) {
@ -125,10 +116,9 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> implements Tre
final String columnsToJoin = arg[2];
if(allBindingNames.contains(bindingName)) {
throw new StingException("The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used.");
throw new StingException("The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used in another binding.");
}
String[] splitOnEquals = columnsToJoin.split("=+");
if(splitOnEquals.length != 2) {
throw new StingException("The -J arg: \"" + joinArg + "\" must specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
@ -140,7 +130,6 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> implements Tre
throw new StingException("The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
}
final String bindingName1 = splitOnDot1[0];
final String columnName1 = splitOnDot1[1];
final String bindingName2 = splitOnDot2[0];
@ -160,7 +149,7 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> implements Tre
externalBindingName = bindingName1;
externalColumnName = columnName1;
} else {
throw new StingException("The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
throw new StingException("The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" must be specified in one the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
}
//validate externalColumnName
@ -171,56 +160,51 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> implements Tre
//read in the file contents into a JoinTable object
final JoinTable joinTable = new JoinTable();
joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName, strict);
joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName);
joinTables.add(joinTable);
//validate localColumnName, and add all column names in this file to the list of allFullyQualifiedColumnNames so that they can be referenced from subsequent -J args.
final List<String> columnNames = joinTable.getColumnNames();
final List<String> fullyQualifiedColumnNames = new LinkedList<String>();
boolean found = false;
for(int i = 0; i < columnNames.size(); i++) {
final String columnName = columnNames.get(i);
if(columnName.equals(localColumnName)) {
found = true;
}
fullyQualifiedColumnNames.add(localBindingName + '.' + columnName);
for ( String columnName : columnNames ) {
if ( columnName.equals(localColumnName) )
found = true;
fullyQualifiedColumnNames.add(localBindingName + '.' + columnName);
}
if(!found) {
if ( !found )
throw new StingException("The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename);
}
allFullyQualifiedColumnNames.addAll(fullyQualifiedColumnNames);
}
//parse the SELECT_COLUMNS arg and validate the column names
List<String> parsedSelectColumns = new LinkedList<String>();
for(String token : SELECT_COLUMNS) {
for ( String token : SELECT_COLUMNS )
parsedSelectColumns.addAll(Arrays.asList(token.split(",")));
}
SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS);
for(String columnName : SELECT_COLUMNS) {
if(!allFullyQualifiedColumnNames.contains(columnName)) {
for ( String columnName : SELECT_COLUMNS ) {
if ( !allFullyQualifiedColumnNames.contains(columnName) )
throw new StingException("The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. Here is the list of available column names: " + allFullyQualifiedColumnNames);
}
}
//instanciate the VariantAnnotatorEngine
//instantiate the VariantAnnotatorEngine
ArrayList<String> annotationsToUse = new ArrayList<String>();
annotationsToUse.add("GenomicAnnotation");
engine = new VariantAnnotatorEngine(getToolkit(), new ArrayList<String>(), annotationsToUse);
engine.setOneToMany( Boolean.TRUE.equals( ONE_TO_MANY ) );
engine.setOneToMany(ONE_TO_MANY);
engine.setRequestedColumns(SELECT_COLUMNS);
engine.setJoinTables(joinTables);
// setup the header fields
// set up the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList("variant")));
hInfo.add(new VCFHeaderLine("source", "Annotator"));
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
hInfo.addAll(engine.getVCFAnnotationDescriptions());
Set<String> rodName = new HashSet<String>();
rodName.add("variant");
TreeSet<String> samples = new TreeSet<String>(SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName));
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
}

View File

@ -30,7 +30,6 @@ import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import org.broadinstitute.sting.utils.StingException;
@ -62,7 +61,7 @@ public class JoinTable
//stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName.columnName in the above example),
//and the entry value is an ArrayList representing the entire join table record.
private HashMap<String, List<ArrayList<String>>> joinColumnValueToRecords = new HashMap<String, List<ArrayList<String>>>();
private HashMap<String, ArrayList<String>> joinColumnValueToRecords = new HashMap<String, ArrayList<String>>();
private boolean parsedFromFile = false;
@ -73,9 +72,8 @@ public class JoinTable
* @param localColumnName The column name within the given file to join on.
* @param externalBindingName The bindingName of another file (previously specified with either -B or -J).
* @param externalColumnName The columnName in this other file to join on.
* @param strict Whether to throw an exception if the number of columnNames in the header doesn't match the number of values in any row in the file specified by filename.
*/
public void parseFromFile(String filename, String localBindingName, String localColumnName, String externalBindingName, String externalColumnName, boolean strict) {
public void parseFromFile(String filename, String localBindingName, String localColumnName, String externalBindingName, String externalColumnName) {
if(parsedFromFile) {
throw new StingException("parseFromFile(" + filename +", ..) called more than once");
}
@ -89,16 +87,15 @@ public class JoinTable
try
{
br = new BufferedReader(new FileReader(filename));
final JoinTableParser parser = new JoinTableParser(strict);
final JoinTableParser parser = new JoinTableParser();
//read in the header
final List<String> header = parser.readHeader(br);
columnNames = header;
columnNames = parser.readHeader(br);
//get the index of the localJoinColumnName
int localColumnNameIdx = -1;
for(int i = 0; i < header.size(); i++) {
final String columnName = header.get(i);
for(int i = 0; i < columnNames.size(); i++) {
final String columnName = columnNames.get(i);
if(columnName.equals(localColumnName)) {
localColumnNameIdx = i;
break;
@ -109,13 +106,14 @@ public class JoinTable
throw new StingException("The -J arg specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename);
}
//read in all records and create a map entry for each
String line = null;
String line;
while((line = br.readLine()) != null) {
final ArrayList<String> columnValues = parser.parseLine(line);
if ( columnValues.size() < columnNames.size() )
throw new IllegalStateException("The file: " + filename + " is malformed as there are not a sufficient number of columns for this line: " + line);
final String joinColumnValue = columnValues.get(localColumnNameIdx);
put(joinColumnValue, columnValues);
put(joinColumnValue, columnValues, filename);
}
}
catch(IOException e)
@ -134,8 +132,6 @@ public class JoinTable
}
}
/**
* If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2,
* this returns bindingName1.
@ -149,7 +145,6 @@ public class JoinTable
this.localBindingName = localBindingName;
}
/**
* @return the list of join table column names parsed out of the file header.
*/
@ -175,7 +170,6 @@ public class JoinTable
this.externalColumnName = externalColumnName;
}
/**
* If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2,
* this returns bindingName2.
@ -192,7 +186,7 @@ public class JoinTable
/**
* Whether any join table records have the given value in the join column.
* @param value
* @param joinColumnValue value
* @return
*/
public boolean containsJoinColumnValue(String joinColumnValue) {
@ -201,24 +195,21 @@ public class JoinTable
/**
* Returns the record in the table whose join column has the given value, or null if no record has that value.
* @param joinColumnValue
* @return
* @param joinColumnValue column value
* @return row
*/
public List<ArrayList<String>> get(String joinColumnValue) {
public ArrayList<String> get(String joinColumnValue) {
return joinColumnValueToRecords.get(joinColumnValue);
}
/**
* Adds the given record to the map.
* @param joinColumnValue
* @param record
* @param joinColumnValue value
* @param record row
*/
protected void put(String joinColumnValue, ArrayList<String> record) {
List<ArrayList<String>> list = joinColumnValueToRecords.get(joinColumnValue);
if(list == null) {
list = new LinkedList<ArrayList<String>>();
joinColumnValueToRecords.put(joinColumnValue, list);
}
list.add(record);
protected void put(String joinColumnValue, ArrayList<String> record, String filename) {
if ( joinColumnValueToRecords.containsKey(joinColumnValue) )
throw new IllegalStateException("The file " + filename + " contains non-unique entries for the requested column, which isn't allowed.");
joinColumnValueToRecords.put(joinColumnValue, record);
}
}

View File

@ -47,23 +47,15 @@ public class JoinTableParser
private List<String> header; //column names parsed out of the header line
/** Whether to throw an exception if a row contains a different number of columns than the header. */
private boolean strict;
/**
* Constructor.
*
* @param source The file to read.
* @param strict Whether to throw an exception if a row contains a different number of columns than the header.
*/
public JoinTableParser(boolean strict) {
this.strict = strict;
}
public JoinTableParser() {}
/**
* Reads the header and returns it.
* @param source
* @param br source
* @return
*/
public List<String> readHeader(BufferedReader br) throws IOException
@ -95,7 +87,7 @@ public class JoinTableParser
final ArrayList<String> values = Utils.split(line, DELIMITER, header.size());
if ( strict && values.size() != header.size() ) {
if ( values.size() != header.size() ) {
throw new StingException(String.format("Encountered a row with %d columns which is different from the number or columns in the header: %d\nHeader: " + header + "\nLine: " + values, values.size(), header.size()));
}
@ -103,12 +95,9 @@ public class JoinTableParser
}
/**
* Returns the header.
* @param source The file to read.
* @param br The file to read.
* @return ArrayList containing column names from the header.
* @throws IOException
*/

View File

@ -26,7 +26,7 @@ public class GenomicAnnotatorIntegrationTest extends WalkerTest {
*/
String[] md5WithDashSArg = {"5942c1bdc736f016af248994e036777a"};
String[] md5WithDashSArg = {"9583d7060bc3de73b392e7435c31946b"};
WalkerTestSpec specWithSArg = new WalkerTestSpec(
"-T GenomicAnnotator -R " + b36KGReference +
" -B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" +