Implemented joins
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3477 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
09c3b15af3
commit
f5f7217413
|
|
@ -33,11 +33,13 @@ import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedHashSet;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
import org.broad.tribble.vcf.VCFHeader;
|
import org.broad.tribble.vcf.VCFHeader;
|
||||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||||
|
|
@ -81,27 +83,19 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> {
|
||||||
@Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B mydbsnp,AnnotatorInputTable,/path/to/mydbsnp.txt -B mytable,AnnotatorInputTable,/path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false)
|
@Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B mydbsnp,AnnotatorInputTable,/path/to/mydbsnp.txt -B mytable,AnnotatorInputTable,/path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false)
|
||||||
protected String[] SELECT_COLUMNS = {};
|
protected String[] SELECT_COLUMNS = {};
|
||||||
|
|
||||||
|
@Argument(fullName="join", shortName="J", doc="Optionally specifies a file and column within that file that should be LEFT-JOIN'ed to a column in a previously-specified file. The file provided to -J must be tab-delimited, with the first non-comment/non-empty line containing column names. (example: -B name,AnnotatorInputTable,/path/to/file1 -J name2,/path/to/file2,name.columnName=name2.columnName2 - this will join the table in file2 to the table in file1) ", required=false)
|
||||||
|
protected String[] JOIN_ARGS = {};
|
||||||
|
|
||||||
@Argument(fullName="oneToMany", shortName="m", doc="If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false)
|
@Argument(fullName="oneToMany", shortName="m", doc="If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false)
|
||||||
protected Boolean ONE_TO_MANY = false;
|
protected Boolean ONE_TO_MANY = false;
|
||||||
|
|
||||||
@Argument(fullName="join", shortName="j", doc="TODO If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false)
|
|
||||||
protected String[] JOIN_COLUMNS = {};
|
|
||||||
|
|
||||||
/*
|
|
||||||
NOTE: there are several cases for file1.a=file2.b
|
|
||||||
if, for a particular locus:
|
|
||||||
only file1.0 matches, INNER-JOIN - skip file1.a OUTER JOIN - annotate with file1.a
|
|
||||||
only file2.0 matches, INNER-JOIN - skip file2.a OUTER JOIN - annotate with file2.a
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
private VCFWriter vcfWriter;
|
private VCFWriter vcfWriter;
|
||||||
|
|
||||||
private HashMap<String, String> nonVCFsampleName = new HashMap<String, String>();
|
|
||||||
|
|
||||||
private VariantAnnotatorEngine engine;
|
private VariantAnnotatorEngine engine;
|
||||||
|
|
||||||
|
private boolean strict = true;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prepare the output file and the list of available features.
|
* Prepare the output file and the list of available features.
|
||||||
|
|
@ -112,34 +106,111 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> {
|
||||||
TreeSet<String> samples = new TreeSet<String>();
|
TreeSet<String> samples = new TreeSet<String>();
|
||||||
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
|
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
|
||||||
|
|
||||||
// add the non-VCF sample from the command-line, if applicable
|
|
||||||
if ( sampleName != null ) {
|
|
||||||
nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
|
|
||||||
samples.add(sampleName.toUpperCase());
|
|
||||||
}
|
|
||||||
|
|
||||||
// if there are no valid samples, warn the user
|
|
||||||
if ( samples.size() == 0 ) {
|
|
||||||
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
//read all ROD file headers and construct a set of all column names to be used for validation of command-line args
|
//read all ROD file headers and construct a set of all column names to be used for validation of command-line args
|
||||||
final HashSet<String> allColumnNames = new HashSet<String>();
|
final Set<String> allFullyQualifiedColumnNames = new LinkedHashSet<String>();
|
||||||
|
final Set<String> allBindingNames = new LinkedHashSet<String>();
|
||||||
try {
|
try {
|
||||||
for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) {
|
for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) {
|
||||||
if(! ds.getReferenceOrderedData().getType().equals(AnnotatorInputTableCodec.class)) {
|
if(! ds.getReferenceOrderedData().getType().equals(AnnotatorInputTableCodec.class)) {
|
||||||
continue; //skip all non-AnnotatorInputTable files.
|
continue; //skip all non-AnnotatorInputTable files.
|
||||||
}
|
}
|
||||||
|
final String bindingName = ds.getName();
|
||||||
|
allBindingNames.add(bindingName);
|
||||||
final ArrayList<String> header = AnnotatorInputTableCodec.readHeader(ds.getReferenceOrderedData().getFile());
|
final ArrayList<String> header = AnnotatorInputTableCodec.readHeader(ds.getReferenceOrderedData().getFile());
|
||||||
for(String columnName : header) {
|
for(String columnName : header) {
|
||||||
allColumnNames.add(ds.getName() + "." + columnName);
|
allFullyQualifiedColumnNames.add(bindingName + "." + columnName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch(IOException e) {
|
} catch(IOException e) {
|
||||||
throw new StingException("Failed when attempting to read file header. ", e);
|
throw new StingException("Failed when attempting to read file header. ", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//parse the JOIN_COLUMNS args, read in the specified files, and validate column names in the = relation. This end result of this loop is to populate the List of joinTables with one entry per -J arg.
|
||||||
|
final List<JoinTable> joinTables = new LinkedList<JoinTable>();
|
||||||
|
for(String joinArg : JOIN_ARGS) {
|
||||||
|
|
||||||
|
//parse the tokens
|
||||||
|
final String[] arg = joinArg.split(",");
|
||||||
|
if(arg.length != 3) {
|
||||||
|
throw new StingException("The following -J arg: \"" + joinArg + "\" must contain 3 comma-separated values. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
|
||||||
|
}
|
||||||
|
final String bindingName = arg[0];
|
||||||
|
final String filename = arg[1];
|
||||||
|
final String columnsToJoin = arg[2];
|
||||||
|
|
||||||
|
if(allBindingNames.contains(bindingName)) {
|
||||||
|
throw new StingException("The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used.");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
String[] splitOnEquals = columnsToJoin.split("=+");
|
||||||
|
if(splitOnEquals.length != 2) {
|
||||||
|
throw new StingException("The -J arg: \"" + joinArg + "\" must specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] splitOnDot1 = splitOnEquals[0].split("\\.");
|
||||||
|
String[] splitOnDot2 = splitOnEquals[1].split("\\.");
|
||||||
|
if(splitOnDot1.length != 2 || splitOnDot2.length != 2) {
|
||||||
|
throw new StingException("The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final String bindingName1 = splitOnDot1[0];
|
||||||
|
final String columnName1 = splitOnDot1[1];
|
||||||
|
final String bindingName2 = splitOnDot2[0];
|
||||||
|
final String columnName2 = splitOnDot2[1];
|
||||||
|
|
||||||
|
//figure out which of the 2 binding names within the = relation matches the -J bindingName
|
||||||
|
final String localBindingName = bindingName; //alias
|
||||||
|
final String localColumnName;
|
||||||
|
final String externalBindingName;
|
||||||
|
final String externalColumnName;
|
||||||
|
if(bindingName1.equals(bindingName)) {
|
||||||
|
localColumnName = columnName1;
|
||||||
|
externalBindingName = bindingName2;
|
||||||
|
externalColumnName = columnName2;
|
||||||
|
} else if(bindingName2.equals(bindingName)) {
|
||||||
|
localColumnName = columnName2;
|
||||||
|
externalBindingName = bindingName1;
|
||||||
|
externalColumnName = columnName1;
|
||||||
|
} else {
|
||||||
|
throw new StingException("The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
|
||||||
|
}
|
||||||
|
|
||||||
|
//validate externalColumnName
|
||||||
|
final String fullyQualifiedExternalColumnName = externalBindingName + '.' + externalColumnName;
|
||||||
|
if( !allFullyQualifiedColumnNames.contains(fullyQualifiedExternalColumnName) ) {
|
||||||
|
throw new StingException("The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + fullyQualifiedExternalColumnName + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
//read in the file contents into a JoinTable object
|
||||||
|
final JoinTable joinTable = new JoinTable();
|
||||||
|
joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName, strict);
|
||||||
|
joinTables.add(joinTable);
|
||||||
|
|
||||||
|
//validate localColumnName, and add all column names in this file to the list of allFullyQualifiedColumnNames so that they can be referenced from subsequent -J args.
|
||||||
|
final List<String> columnNames = joinTable.getColumnNames();
|
||||||
|
final List<String> fullyQualifiedColumnNames = new LinkedList<String>();
|
||||||
|
boolean found = false;
|
||||||
|
for(int i = 0; i < columnNames.size(); i++) {
|
||||||
|
final String columnName = columnNames.get(i);
|
||||||
|
if(columnName.equals(localColumnName)) {
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
fullyQualifiedColumnNames.add(localBindingName + '.' + columnName);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!found) {
|
||||||
|
throw new StingException("The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename);
|
||||||
|
}
|
||||||
|
|
||||||
|
allFullyQualifiedColumnNames.addAll(fullyQualifiedColumnNames);
|
||||||
|
}
|
||||||
|
|
||||||
//parse the SELECT_COLUMNS arg and validate the column names
|
//parse the SELECT_COLUMNS arg and validate the column names
|
||||||
List<String> parsedSelectColumns = new LinkedList<String>();
|
List<String> parsedSelectColumns = new LinkedList<String>();
|
||||||
for(String token : SELECT_COLUMNS) {
|
for(String token : SELECT_COLUMNS) {
|
||||||
|
|
@ -148,17 +219,16 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> {
|
||||||
SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS);
|
SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS);
|
||||||
|
|
||||||
for(String columnName : SELECT_COLUMNS) {
|
for(String columnName : SELECT_COLUMNS) {
|
||||||
if(!allColumnNames.contains(columnName)) {
|
if(!allFullyQualifiedColumnNames.contains(columnName)) {
|
||||||
throw new StingException("The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. Here is the list of available column names: " + allColumnNames);
|
throw new StingException("The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. Here is the list of available column names: " + allFullyQualifiedColumnNames);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//instanciate the VariantAnnotatorEngine
|
||||||
|
|
||||||
//instanciate teh VariantAnnotatorEngine
|
|
||||||
engine = new VariantAnnotatorEngine(getToolkit(), new String[] { }, new String[] { "GenomicAnnotation" });
|
engine = new VariantAnnotatorEngine(getToolkit(), new String[] { }, new String[] { "GenomicAnnotation" });
|
||||||
engine.setOneToMany( Boolean.TRUE.equals( ONE_TO_MANY ) );
|
engine.setOneToMany( Boolean.TRUE.equals( ONE_TO_MANY ) );
|
||||||
engine.setRequestedColumns(SELECT_COLUMNS);
|
engine.setRequestedColumns(SELECT_COLUMNS);
|
||||||
|
engine.setJoinTables(joinTables);
|
||||||
|
|
||||||
// setup the header fields
|
// setup the header fields
|
||||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||||
|
|
@ -224,13 +294,14 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
if ( variant instanceof VCFRecord) { //TODO is this if-statement necessary?
|
if ( variant instanceof VCFRecord) { //TODO is this if-statement necessary?
|
||||||
for(VariantContext annotatedVC : annotatedVCs ) {
|
for(VariantContext annotatedVC : annotatedVCs ) {
|
||||||
vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
|
vcfWriter.addRecord(VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 1;
|
return annotatedVCs.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Increment the number of loci processed.
|
* Increment the number of loci processed.
|
||||||
*
|
*
|
||||||
|
|
@ -247,10 +318,21 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> {
|
||||||
*
|
*
|
||||||
* @param result the number of loci seen.
|
* @param result the number of loci seen.
|
||||||
*/
|
*/
|
||||||
public void onTraversalDone(Integer result) {
|
public void onTraversalDone(Integer totalOutputVCFRecords) {
|
||||||
out.printf("Processed %d loci.\n", result);
|
out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords);
|
||||||
|
Map<String, Integer> inputTableHitCounter = engine.getInputTableHitCounter();
|
||||||
|
for(Entry<String, Integer> e : inputTableHitCounter.entrySet()) {
|
||||||
|
final String bindingName = e.getKey();
|
||||||
|
final int counter = e.getValue();
|
||||||
|
final float percent = 100 * counter /(float) totalOutputVCFRecords;
|
||||||
|
out.printf(" %-6.1f%% (%d) annotated with %s.\n", percent, counter, bindingName );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
vcfWriter.close();
|
vcfWriter.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue