Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2011-08-02 13:30:46 -04:00
commit 2ba57bb502
23 changed files with 87 additions and 3446 deletions

View File

@ -1,193 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata.features.annotator;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
public class AnnotatorInputTableCodec implements ReferenceDependentFeatureCodec<AnnotatorInputTableFeature> {

    private static Logger logger = Logger.getLogger(AnnotatorInputTableCodec.class);

    /** Column delimiter used by annotator input tables. */
    public static final String DELIMITER = "\t";

    /** Column names parsed from the file header; populated by {@link #readHeader(LineReader)}. */
    private ArrayList<String> header;

    /**
     * The parser to use when resolving genome-wide locations.
     */
    private GenomeLocParser genomeLocParser;

    /**
     * Set the parser to use when resolving genetic data.
     * @param genomeLocParser The supplied parser.
     */
    public void setGenomeLocParser(GenomeLocParser genomeLocParser) {
        this.genomeLocParser = genomeLocParser;
    }

    /**
     * Parses the header and caches it for later use by {@link #decode(String)}.
     *
     * @param reader the line source positioned at the start of the file
     * @return The header fields.
     * @throws IllegalArgumentException if the underlying reader cannot be read
     */
    public Object readHeader(LineReader reader) {
        int[] lineCounter = new int[1];
        try {
            header = readHeader(reader, lineCounter);
        } catch (IOException e) {
            throw new IllegalArgumentException("Unable to read from file.", e);
        }
        return header;
    }

    public Class<AnnotatorInputTableFeature> getFeatureType() {
        return AnnotatorInputTableFeature.class;
    }

    /**
     * Parses only the genomic location out of a line. The position is either a single
     * "chr:start-stop"-style token in the first column, or three separate columns
     * (chromosome, start, stop).
     *
     * @param line one data line of the table
     * @return a feature carrying only the location
     * @throws CodecLineParsingException if the line has too few columns to locate
     */
    @Override
    public Feature decodeLoc(String line) {
        StringTokenizer st = new StringTokenizer(line, DELIMITER);
        if ( st.countTokens() < 1 )
            throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line);

        GenomeLoc loc;
        String chr = st.nextToken();
        if ( chr.contains(":") ) {
            // first column encodes the full location, e.g. "chr1:100-200"
            loc = genomeLocParser.parseGenomeLoc(chr);
        } else {
            // The chromosome token has already been consumed above, so only the
            // start and stop tokens (2 more) are still required. (A "< 3" check here
            // would wrongly reject a valid 3-column "chr<TAB>start<TAB>stop" line,
            // which decode(..) accepts.)
            if ( st.countTokens() < 2 )
                throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line);
            loc = genomeLocParser.createGenomeLoc(chr, Integer.valueOf(st.nextToken()), Integer.valueOf(st.nextToken()));
        }
        return new AnnotatorInputTableFeature(loc.getContig(), loc.getStart(), loc.getStop());
    }

    /**
     * Parses the line into an AnnotatorInputTableFeature object, mapping each header
     * column name to the corresponding value on this line.
     *
     * @param line one data line of the table
     * @return the fully populated feature
     * @throws CodecLineParsingException if the column count doesn't match the header
     */
    public AnnotatorInputTableFeature decode(String line) {
        final ArrayList<String> header = this.header; //optimization
        final ArrayList<String> values = Utils.split(line, DELIMITER, header.size());

        if ( values.size() != header.size()) {
            throw new CodecLineParsingException(String.format("Encountered a line that has %d columns while the header has %d columns.\nHeader: " + header + "\nLine: " + values, values.size(), header.size()));
        }

        final AnnotatorInputTableFeature feature = new AnnotatorInputTableFeature(header);
        for ( int i = 0; i < header.size(); i++ ) {
            feature.putColumnValue(header.get(i), values.get(i));
        }

        // parse the location: either a combined "chr:start-stop" first column,
        // or separate chromosome/start/stop in the first three columns
        GenomeLoc loc;
        if ( values.get(0).contains(":") )
            loc = genomeLocParser.parseGenomeLoc(values.get(0));
        else
            loc = genomeLocParser.createGenomeLoc(values.get(0), Integer.valueOf(values.get(1)), Integer.valueOf(values.get(2)));

        feature.setChr(loc.getContig());
        feature.setStart((int)loc.getStart());
        feature.setEnd((int)loc.getStop());

        return feature;
    }

    /**
     * Returns the header of the given file.
     * @param source the file to read
     * @return The header fields.
     * @throws IOException on read failure
     */
    public static ArrayList<String> readHeader(final File source) throws IOException {
        FileInputStream is = new FileInputStream(source);
        try {
            return readHeader(new AsciiLineReader(is), null);
        } finally {
            is.close();
        }
    }

    /**
     * Returns the header, and also sets the 2nd arg to the number of lines in the header.
     * @param source the line source
     * @param lineCounter An array of length 1 or null. If not null, array[0] will be set to the number of lines in the header.
     * @return The header fields.
     * @throws IOException on read failure
     */
    private static ArrayList<String> readHeader(final LineReader source, int[] lineCounter) throws IOException {
        ArrayList<String> header = null;
        int numLines = 0;

        // find the 1st line that's non-empty and not a comment
        String line = null;
        while( (line = source.readLine()) != null ) {
            numLines++;
            if ( line.trim().isEmpty() || line.startsWith("#") ) {
                continue;
            }
            // parse the header
            header = Utils.split(line, DELIMITER);
            break;
        }

        // check that we found the header
        if ( header == null ) {
            throw new IllegalArgumentException("No header in " + source + ". All lines are either comments or empty.");
        }

        if(lineCounter != null) {
            lineCounter[0] = numLines;
        }

        logger.debug(String.format("Found header line containing %d columns:\n[%s]", header.size(), Utils.join("\t", header)));
        return header;
    }
}

View File

@ -1,158 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata.features.annotator;
import org.broad.tribble.Feature;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
/**
* This class represents a single record in an AnnotatorInputTable.
*/
/**
 * This class represents a single record in an AnnotatorInputTable.
 */
public class AnnotatorInputTableFeature implements Feature {

    /** Column names from the file header; null when constructed from a location alone. */
    private ArrayList<String> columnNames;

    private HashMap<String, String> columnValues; //maps column names to column values

    private String chr;
    private int start;
    private int end;

    /** Lazily computed, cached string representation. */
    private String strRep = null;

    /**
     * Constructor for a feature known only by its position (no column data).
     * @param chr The chromosome name.
     * @param start The start position
     * @param end The end position
     */
    public AnnotatorInputTableFeature(String chr, int start, int end) {
        this.chr = chr;
        this.start = start;
        this.end = end;
        // initialize the map so getColumnValue/containsColumnName don't NPE
        // on location-only features
        this.columnValues = new HashMap<String, String>();
    }

    /**
     * Constructor for a feature whose column values will be filled in.
     * @param columnNames The column names as parsed out of the file header.
     */
    public AnnotatorInputTableFeature(ArrayList<String> columnNames) {
        this.columnNames = columnNames;
        this.columnValues = new HashMap<String, String>();
    }

    /**
     * @return the list of column names from the file header, or null if this
     * feature was constructed from a location alone.
     */
    public ArrayList<String> getHeader() {
        return columnNames;
    }

    /**
     * Returns the value of the given column.
     *
     * @param columnName The column name as it appears in the file header.
     * @return The value, or null if the column is not present.
     */
    public String getColumnValue(final String columnName) {
        return columnValues.get(columnName);
    }

    /** @return whether a value has been stored for the given column name. */
    public boolean containsColumnName(final String columnName) {
        return columnValues.containsKey(columnName);
    }

    /**
     * Sets the value for the given column.
     *
     * @param columnName The column name as it appears in the file header.
     * @param value The value
     * @return The existing value associated with the columnName, if there is one.
     */
    protected String putColumnValue(final String columnName, final String value) {
        return columnValues.put(columnName, value);
    }

    /**
     * @return all values in this line, hashed by their column names.
     */
    public Map<String,String> getColumnValues() {
        return Collections.unmodifiableMap(columnValues);
    }

    public String getChr() {
        return chr;
    }

    public int getStart() {
        return start;
    }

    public int getEnd() {
        return end;
    }

    protected void setChr(String chr) {
        this.chr = chr;
    }

    protected void setStart(int start) {
        this.start = start;
    }

    protected void setEnd(int end) {
        this.end = end;
    }

    @Override
    public String toString() {
        if ( strRep == null ) {
            if ( columnNames == null ) {
                // location-only feature: the previous implementation threw a
                // NullPointerException here; fall back to a position string
                strRep = chr + ":" + start + "-" + end;
            } else {
                // always emit the opening bracket so an empty column list
                // renders as "[]" rather than a lone "]"
                StringBuilder sb = new StringBuilder("[");
                for ( String columnName : columnNames ) {
                    if ( sb.length() > 1 )
                        sb.append(", ");
                    sb.append(columnName).append("=").append(columnValues.get(columnName));
                }
                sb.append("]");
                strRep = sb.toString();
            }
        }
        return strRep;
    }
}

View File

@ -219,7 +219,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
if ( stratifiedContexts != null ) {
annotatedVCs = new ArrayList<VariantContext>(VCs.size());
for ( VariantContext vc : VCs )
annotatedVCs.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
}
}

View File

@ -31,8 +31,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.GenomicAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.JoinTable;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
@ -45,7 +43,6 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
import java.util.Map.Entry;
public class VariantAnnotatorEngine {
@ -58,19 +55,6 @@ public class VariantAnnotatorEngine {
private HashMap<String, String> dbAnnotations = new HashMap<String, String>();
// command-line option from GenomicAnnotator.
private Map<String, Set<String>> requestedColumnsMap;
// command-line option from GenomicAnnotator.
private boolean oneToMany;
// command-line option from GenomicAnnotator.
private List<JoinTable> joinTables;
// used by GenomicAnnotator. Maps binding name to number of output VCF records
// annotated with records from the input table with this binding name. Only used for
// printing out stats at the end.
private Map<String, Integer> inputTableHitCounter = new HashMap<String, Integer>();
private static class VAExpression {
public String fullName, bindingName, fieldName;
@ -140,7 +124,7 @@ public class VariantAnnotatorEngine {
return descriptions;
}
public Collection<VariantContext> annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
@ -150,42 +134,18 @@ public class VariantAnnotatorEngine {
// annotate expressions where available
annotateExpressions(tracker, ref, infoAnnotations);
// process the info field
List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
infoAnnotationOutputsList.add(new LinkedHashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line.
infoAnnotationOutputsList.get(0).putAll(infoAnnotations); // put the DB membership info in
// go through all the requested info annotationTypes
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
{
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
if ( annotationsFromCurrentType == null ) {
continue;
}
if(annotationType instanceof GenomicAnnotation)
{
infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType );
}
else
{
// add the annotations to each output line.
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
infoAnnotationOutput.putAll(annotationsFromCurrentType);
}
}
if ( annotationsFromCurrentType != null )
infoAnnotations.putAll(annotationsFromCurrentType);
}
// annotate genotypes
Map<String, Genotype> genotypes = annotateGenotypes(tracker, ref, stratifiedContexts, vc);
// generate a new annotated VC
final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations);
// create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, infoAnnotationOutput) );
}
return returnValue;
// annotate genotypes, creating another new VC in the process
return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc));
}
private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
@ -251,6 +211,9 @@ public class VariantAnnotatorEngine {
return genotypes;
}
/*
// Finish processing data from GenomicAnnotation.
private List<Map<String, Object>> processGenomicAnnotation( List<Map<String, Object>> infoAnnotationOutputsList, Map<String, Object> annotationsForCurrentLocusFromAllAnnotatorInputTables)
{
@ -403,12 +366,14 @@ public class VariantAnnotatorEngine {
incrementStatsCounter(bindingName, infoAnnotationOutputsList.size());
}
/**
*/
/**
* Records statistics that will be printed when GenomicAnnotator finishes.
*
* @param bindingName The table from which annotations were gotten
* @param numNewRecords The number of new output VCF records created with annotations from this table
*/
*//*
private void incrementStatsCounter( final String bindingName, int numNewRecords) {
//record some stats - there were infoAnnotationOutputsList.size() output VCF records annotated with data from the 'bindingName' input table.
Integer counter = inputTableHitCounter.get(bindingName);
@ -453,13 +418,15 @@ public class VariantAnnotatorEngine {
}
/**
*/
/**
* Records statistics for the explodeInfoAnnotationOutputsList(..) calculation.
* @param bindingName The table from which annotations were gotten
* @param numNewVCFRecordsAnnotatedWithBindingNameData The number of new output VCF records created with annotations from this table
* @param infoAnnotationOutputsList output list
* @param matchingRecordsSize matching records size
*/
*//*
private void recordStats( final String bindingName, int numNewVCFRecordsAnnotatedWithBindingNameData, final List<Map<String, Object>> infoAnnotationOutputsList, int matchingRecordsSize ) {
//update stats for the 'bindingName' table
@ -509,13 +476,14 @@ public class VariantAnnotatorEngine {
}
/**
*/
/**
* Determines whether to exclude the given column from the annotations.
* @param key The fully qualified columnName
* @return Whether the -S arg specifies that this column should be included in the annotations.
*
* TODO this function can be optimized through memoization
*/
*//*
private boolean isKeyFilteredOutBySelectArg(String key)
{
for(final String bindingName : requestedColumnsMap.keySet()) {
@ -536,10 +504,8 @@ public class VariantAnnotatorEngine {
return false; //the -S arg doesn't have anything with the same binding name as this key, so the user implicitly requested this key
}
/**
*/
/**
* Determines how the engine will handle the case where multiple records in a ROD file
* overlap a particular single locus. If oneToMany is set to true, the output will be
* one-to-many, so that each locus in the input VCF file could result in multiple
@ -551,18 +517,21 @@ public class VariantAnnotatorEngine {
* See class-level comments for more details.
*
* @param oneToMany true if we should break out from one to many
*/
*//*
public void setOneToMany(boolean oneToMany) {
this.oneToMany = oneToMany;
}
/**
*/
/**
* Sets the columns that will be used for the info annotation field.
* Column names should be of the form bindingName.columnName (eg. dbsnp.avHet).
*
* @param columns An array of strings where each string is a comma-separated list
* of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ).
*/
*//*
public void setRequestedColumns(String[] columns) {
if(columns == null) {
throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg.");
@ -574,17 +543,20 @@ public class VariantAnnotatorEngine {
}
/**
*/
/**
* Passes in a pointer to the JoinTables.
*
* @param joinTables The list of JoinTables. There should be one JoinTable object for each -J arg.
*/
*//*
public void setJoinTables(List<JoinTable> joinTables) {
this.joinTables = joinTables;
}
/**
*/
/**
* Parses the columns arg and returns a Map of columns hashed by their binding name.
* For example:
* The command line:
@ -604,7 +576,8 @@ public class VariantAnnotatorEngine {
* @param columnsArg The -s command line arg value.
*
* @return Map representing a parsed version of this arg - see above.
*/
*//*
private static Map<String, Set<String>> parseColumnsArg(String[] columnsArg) {
Map<String, Set<String>> result = new HashMap<String, Set<String>>();
@ -635,5 +608,6 @@ public class VariantAnnotatorEngine {
return Collections.unmodifiableMap(inputTableHitCounter);
}
*/
}

View File

@ -1,299 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
import java.util.Map.Entry;
/**
* This plugin for {@link VariantAnnotatorEngine} serves as the core
* of the {@link GenomicAnnotator}. It finds all records in the -B input files
* that match the given variant's position and, optionally, the variant's reference and alternate alleles.
*
* For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator
*/
public class GenomicAnnotation extends InfoFieldAnnotation {
public static final String CHR_COLUMN = "chr";
public static final String START_COLUMN = "start";
public static final String END_COLUMN = "end";
public static final String HAPLOTYPE_REFERENCE_COLUMN = "haplotypeReference";
public static final String HAPLOTYPE_ALTERNATE_COLUMN = "haplotypeAlternate";
public static final String NUM_MATCHES_SPECIAL_INFO_FIELD = "numMatchingRecords";
/** Characters that aren't allowed within VCF info field key-value pairs */
public static final char[] ILLEGAL_INFO_FIELD_VALUES = { ' ', '=', ';' };
/** Replacement for each character in ILLEGAL_INFO_FIELD_VALUES */
public static final char[] ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES = { '_', '-', '!' };
private void modifyAnnotationsForIndels(VariantContext vc, String featureName, Map<String, String> annotationsForRecord) {
String inCodingRegionKey = featureName + ".inCodingRegion";
String referenceCodonKey = featureName + ".referenceCodon";
String variantCodonKey = featureName + ".variantCodon";
String codingCoordStrKey = featureName + ".codingCoordStr";
String proteinCoordStrKey = featureName + ".proteinCoordStr";
String haplotypeReferenceKey = featureName + "." + HAPLOTYPE_REFERENCE_COLUMN;
String haplotypeAlternateKey = featureName + "." + HAPLOTYPE_ALTERNATE_COLUMN;
String functionalClassKey = featureName + ".functionalClass";
String startKey = featureName + "." + START_COLUMN;
String endKey = featureName + "." + END_COLUMN;
String referenceAAKey = featureName + ".referenceAA";
String variantAAKey = featureName + ".variantAA";
String changesAAKey = featureName + ".changesAA";
annotationsForRecord.put(variantCodonKey, "unknown");
annotationsForRecord.put(codingCoordStrKey, "unknown");
annotationsForRecord.put(proteinCoordStrKey, "unknown");
annotationsForRecord.put(referenceAAKey, "unknown");
annotationsForRecord.put(variantAAKey, "unknown");
String refAllele = vc.getReference().getDisplayString();
if (refAllele.length() == 0) { refAllele = "-"; }
String altAllele = vc.getAlternateAllele(0).toString();
if (altAllele.length() == 0) { altAllele = "-"; }
annotationsForRecord.put(haplotypeReferenceKey, refAllele);
annotationsForRecord.put(haplotypeAlternateKey, altAllele);
annotationsForRecord.put(startKey, String.format("%d", vc.getStart()));
annotationsForRecord.put(endKey, String.format("%d", vc.getEnd()));
boolean isCodingRegion = annotationsForRecord.containsKey(inCodingRegionKey) && annotationsForRecord.get(inCodingRegionKey).equalsIgnoreCase("true") ? true : false;
boolean isFrameshift = (vc.getIndelLengths().get(0) % 3 == 0) ? false : true;
String functionalClass;
if (isCodingRegion) {
functionalClass = isFrameshift ? "frameshift" : "inframe";
annotationsForRecord.put(changesAAKey, "true");
} else {
functionalClass = "noncoding";
}
annotationsForRecord.put(functionalClassKey, functionalClass);
}
/**
* For each -B input file, for each record which overlaps the current locus, generates a
* set of annotations of the form:
*
* bindingName.columnName1=columnValue, bindingName.columnName2=columnValue2, etc.
*
* For example: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc.
*
* @return The following is an explanation of this method's return value:
*
* The annotations from a matching in a particular file are stored in a Map<String, String>
* where the key is bindingName.columnName and the value is the columnValue.
* Since a single input file can have multiple records that overlap the current
* locus (eg. dbSNP can have multiple entries for the same genomic position), a different
* Map<String, String> is created for each matching record in a particular file.
* The set of matching records for each file is then represented as a List<Map<String, String>>
*
* The return value of this method is a Map<String, Object> of the form:
* rodName1 -> List<Map<String, String>>
* rodName2 -> List<Map<String, String>>
* rodName3 -> List<Map<String, String>>
* ...
* Where the rodNames are the -B binding names for each file that were specified on the command line (eg. -B bindingName,AnnotatorInputTable,/path/to/file).
*
* NOTE: The lists (List<Map<String, String>>) are guaranteed to have size > 0
* because a rodName -> List<Map<String, String>> entry will only
* be created in Map<String, Object> if the List has at least one element.
*/
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc) {
//iterate over each record that overlaps the current locus, and, if it passes certain filters,
//add its values to the list of annotations for this locus.
final Map<String, Object> annotations = new HashMap<String, Object>();
for(final GATKFeature gatkFeature : tracker.getAllRods())
{
final String name = gatkFeature.getName();
if( name.equals("variant") || name.equals("interval") ) {
continue;
}
if( ! (gatkFeature.getUnderlyingObject() instanceof AnnotatorInputTableFeature) ) {
continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns.
}
final Map<String, String> annotationsForRecord = convertRecordToAnnotations( gatkFeature.getName(), ((AnnotatorInputTableFeature) gatkFeature.getUnderlyingObject()).getColumnValues());
//If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the
//alleles specified match the the variant's reference allele and alternate allele.
//If they don't match, this record will be skipped, and its values will not be used for annotations.
//
//If one of these columns doesn't exist in the current rod, or if its value is * (star), then this is treated as an automatic match.
//Otherwise, the HAPLOTYPE_REFERENCE_COLUMN is only considered to be matching the variant's reference if the string values of the two
//are exactly equal (case-insensitive).
//The HAPLOTYPE_REFERENCE_COLUMN matches the variant's reference allele based on a case-insensitive string comparison.
//The HAPLOTYPE_ALTERNATE_COLUMN can optionally list more than allele separated by one of these chars: ,\/:|
// only check this value for SNPs
String hapAltValue = vc.isSNP() ? annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_ALTERNATE_COLUMN) ) : null;
if ( hapAltValue != null && !hapAltValue.equals("*") ) {
Set<Allele> alternateAlleles = vc.getAlternateAlleles();
//if(alternateAlleles.isEmpty()) {
//handle a site that has been called monomorphic reference
//alternateAlleles.add(vc.getReference());
//continue; //TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right?
//} else
if(alternateAlleles.size() > 1) {
throw new UserException.MalformedFile("File associated with " + vc.getSource() + " contains record [" + vc + "] contains " + alternateAlleles.size() + " alternate alleles. GenomicAnnotion currently only supports annotating 1 alternate allele.");
}
Allele vcAlt;
if(alternateAlleles.isEmpty()) {
vcAlt = vc.getReference();
} else {
vcAlt = alternateAlleles.iterator().next();
}
boolean matchFound = false;
for(String hapAlt : hapAltValue.split("[,\\\\/:|]")) {
if(!hapAlt.isEmpty() && vcAlt.basesMatch(hapAlt)) {
matchFound = true;
break;
}
}
if(!matchFound) {
continue; //skip record - none of its alternate alleles match the variant's alternate allele
}
}
// only check this value for SNPs
String hapRefValue = vc.isSNP() ? annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_REFERENCE_COLUMN) ) : null;
if(hapRefValue != null)
{
hapRefValue = hapRefValue.trim();
if(!hapRefValue.equals("*"))
{
//match against hapolotypeReference.
Allele vcRef = vc.getReference();
if(!vcRef.basesMatch(hapRefValue)) {
continue; //skip record
}
}
}
if (vc.isIndel()) {
modifyAnnotationsForIndels(vc, name, annotationsForRecord);
}
//filters passed, so add this record.
List<Map<String, String>> listOfMatchingRecords = (List<Map<String, String>>) annotations.get( name );
if(listOfMatchingRecords == null) {
listOfMatchingRecords = new LinkedList<Map<String,String>>();
listOfMatchingRecords.add( annotationsForRecord );
annotations.put(name, listOfMatchingRecords);
} else {
listOfMatchingRecords.add( annotationsForRecord );
}
}
return annotations;
}
/**
* Converts the given record to a set of key-value pairs of the form:
* bindingName.columnName1=column1Value, bindingName.columnName2=column2Value
* (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A)
*
* @param record AnnotatorInputTableFeature corresponding to one record in one -B input file.
* @param bindingName The binding name of the given AnnotatorInputTableFeature.
* @return The map of columnName -> columnValue pairs.
*/
public static Map<String, String> convertRecordToAnnotations( String bindingName, Map<String, String> record) {
final Map<String, String> result = new HashMap<String, String>();
for(final Entry<String, String> entry : record.entrySet()) {
final String value = entry.getValue();
if(!value.trim().isEmpty()) {
result.put( generateInfoFieldKey(bindingName, entry.getKey()), scrubInfoFieldValue(entry.getValue()));
}
}
return result;
}
/**
* Combines the 2 values into a full key.
* @param rodBindingName -B name
* @param columnName column name
* @return info field key
*/
public static String generateInfoFieldKey(String rodBindingName, String columnName ) {
return rodBindingName + '.' + columnName;
}
/**
* Replaces any characters that are not allowed in the info field of a VCF file.
*
* @param value info field value
* @return the value with any illegal characters replaced by legal ones.
*/
private static String scrubInfoFieldValue(String value) {
for(int i = 0; i < GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES.length; i++) {
value = value.replace(GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES[i], GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES[i]);
}
return value;
}
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(new VCFInfoHeaderLine("GenericAnnotation", 1, VCFHeaderLineType.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position."));
}
/**
 * @return the INFO-field key names produced by this annotation.
 */
public List<String> getKeyNames() {
    final String[] keys = { "GenericAnnotation" };
    return Arrays.asList(keys);
}
}

View File

@ -1,287 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
/**
* Annotates variant calls with information from user-specified tabular files.
*
* For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator
*/
//Walks over the reference, requiring a "variant" ROD; each variant is annotated
//with matching rows from the user's tabular -B/-J inputs and written to vcfWriter.
@Requires(value={DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariantContext.class))
@By(DataSource.REFERENCE)
public class GenomicAnnotator extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
//deprecated alias for --out; kept only so old command lines fail gracefully
@Argument(fullName="vcfOutput", shortName="vcf", doc="Please use --out instead", required=false)
@Deprecated
protected String oldOutArg;
@Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
protected String sampleName = null;
@Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B:mydbsnp,AnnotatorInputTable /path/to/mydbsnp.txt -B:mytable,AnnotatorInputTable /path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false)
protected String[] SELECT_COLUMNS = {};
@Argument(fullName="join", shortName="J", doc="Optionally specifies a file and column within that file that should be LEFT-JOIN'ed to a column in a previously-specified file. The file provided to -J must be tab-delimited, with the first non-comment/non-empty line containing column names. (example: -B:name,AnnotatorInputTable /path/to/file1 -J name2,/path/to/file2,name.columnName=name2.columnName2 - this will join the table in file2 to the table in file1) ", required=false)
protected String[] JOIN_ARGS = {};
@Argument(fullName="oneToMany", shortName="m", doc="If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false)
protected Boolean ONE_TO_MANY = false;
@Argument(fullName="maxJoinTableSize", shortName="maxJoin", doc="The maximum allowed size (i.e. number of rows) for a table provided with the -J argument", required=false)
protected Integer MAX_JOIN_TABLE_SIZE = 500000;
@Argument(fullName="ignoreFilteredSites", shortName="noFilt", doc="If specified, don't annotate sites marked as filtered out")
protected Boolean IGNORE_FILTERED_SITES = false;
//engine that computes the GenomicAnnotation INFO fields; constructed in initialize()
private VariantAnnotatorEngine engine;
/**
 * Prepare the output file and the list of available features:
 * validates the -J and -s args against the -B file headers, builds the
 * VariantAnnotatorEngine, and writes the VCF header.
 */
public void initialize() {
//read all ROD file headers and construct a set of all column names to be used for validation of command-line args
final Set<String> allFullyQualifiedColumnNames = new LinkedHashSet<String>();
final Set<String> allBindingNames = new LinkedHashSet<String>();
for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) {
if(! ds.getType().equals(AnnotatorInputTableCodec.class)) {
continue; //skip all non-AnnotatorInputTable files.
}
final String bindingName = ds.getName();
File file = ds.getFile();
allBindingNames.add(bindingName);
try {
final ArrayList<String> header = AnnotatorInputTableCodec.readHeader(file);
for(String columnName : header) {
//column names are qualified as bindingName.columnName for later -s/-J validation
allFullyQualifiedColumnNames.add(bindingName + "." + columnName);
}
} catch(IOException e) {
throw new UserException.CouldNotReadInputFile(file, "Failed when attempting to read file header. ", e);
}
}
//parse the JOIN_COLUMNS args, read in the specified files, and validate column names in the = relation. This end result of this loop is to populate the List of joinTables with one entry per -J arg.
final List<JoinTable> joinTables = new LinkedList<JoinTable>();
for(String joinArg : JOIN_ARGS) {
//parse the tokens
final String[] arg = joinArg.split(",");
if(arg.length != 3) {
throw new UserException.BadArgumentValue("-J", "The following -J arg: \"" + joinArg + "\" must contain 3 comma-separated values. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
}
final String bindingName = arg[0];
final String filename = arg[1];
final String columnsToJoin = arg[2];
//-J binding names must not collide with names already used by -B (or earlier -J) args
if(allBindingNames.contains(bindingName)) {
throw new UserException.BadArgumentValue("-J", "The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used in another binding.");
}
String[] splitOnEquals = columnsToJoin.split("=+");
if(splitOnEquals.length != 2) {
throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" must specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
}
//each side of the '=' must be of the form bindingName.columnName
String[] splitOnDot1 = splitOnEquals[0].split("\\.");
String[] splitOnDot2 = splitOnEquals[1].split("\\.");
if(splitOnDot1.length != 2 || splitOnDot2.length != 2) {
throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
}
final String bindingName1 = splitOnDot1[0];
final String columnName1 = splitOnDot1[1];
final String bindingName2 = splitOnDot2[0];
final String columnName2 = splitOnDot2[1];
//figure out which of the 2 binding names within the = relation matches the -J bindingName
final String localBindingName = bindingName; //alias
final String localColumnName;
final String externalBindingName;
final String externalColumnName;
if(bindingName1.equals(bindingName)) {
localColumnName = columnName1;
externalBindingName = bindingName2;
externalColumnName = columnName2;
} else if(bindingName2.equals(bindingName)) {
localColumnName = columnName2;
externalBindingName = bindingName1;
externalColumnName = columnName1;
} else {
throw new UserException.BadArgumentValue("-J", "The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" must be specified in one the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)");
}
//validate externalColumnName
final String fullyQualifiedExternalColumnName = externalBindingName + '.' + externalColumnName;
if( !allFullyQualifiedColumnNames.contains(fullyQualifiedExternalColumnName) ) {
throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + fullyQualifiedExternalColumnName + "\"");
}
//read in the file contents into a JoinTable object
final JoinTable joinTable = new JoinTable(MAX_JOIN_TABLE_SIZE);
joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName);
joinTables.add(joinTable);
//validate localColumnName, and add all column names in this file to the list of allFullyQualifiedColumnNames so that they can be referenced from subsequent -J args.
final List<String> columnNames = joinTable.getColumnNames();
final List<String> fullyQualifiedColumnNames = new LinkedList<String>();
boolean found = false;
for ( String columnName : columnNames ) {
if ( columnName.equals(localColumnName) )
found = true;
fullyQualifiedColumnNames.add(localBindingName + '.' + columnName);
}
if ( !found )
throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename);
allFullyQualifiedColumnNames.addAll(fullyQualifiedColumnNames);
}
//parse the SELECT_COLUMNS arg and validate the column names
List<String> parsedSelectColumns = new LinkedList<String>();
for ( String token : SELECT_COLUMNS )
parsedSelectColumns.addAll(Arrays.asList(token.split(",")));
SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS);
for ( String columnName : SELECT_COLUMNS ) {
if ( !allFullyQualifiedColumnNames.contains(columnName) )
throw new UserException.BadArgumentValue("-s", "The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. Here is the list of available column names: " + allFullyQualifiedColumnNames);
}
//instantiate the VariantAnnotatorEngine
ArrayList<String> annotationsToUse = new ArrayList<String>();
annotationsToUse.add("GenomicAnnotation");
engine = new VariantAnnotatorEngine(getToolkit(), new ArrayList<String>(), annotationsToUse);
engine.setOneToMany(ONE_TO_MANY);
engine.setRequestedColumns(SELECT_COLUMNS);
engine.setJoinTables(joinTables);
// set up the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList("variant")));
hInfo.addAll(engine.getVCFAnnotationDescriptions());
Set<String> rodName = new HashSet<String>();
rodName.add("variant");
Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
}
/**
 * Initialize the number of loci processed to zero.
 *
 * @return 0
 */
public Integer reduceInit() { return 0; }
/**
 * We want reads that span deletions
 *
 * @return true
 */
public boolean includeReadsWithDeletionAtLoci() { return true; }
/**
 * For each site of interest, annotate based on the requested annotation types.
 * Filtered sites (when -noFilt is set) and multi-allelic variants are passed
 * through unannotated.
 *
 * @param tracker the meta-data tracker
 * @param ref the reference base
 * @param context the context for the given locus
 * @return 1 if the locus was successfully processed, 0 if otherwise
 */
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return 0;
//LinkedHashSet preserves the order in which variants were encountered
Set<VariantContext> results = new LinkedHashSet<VariantContext>();
for (VariantContext vc : tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false)) {
if ( (vc.isFiltered() && IGNORE_FILTERED_SITES) ||
(vc.isVariant() && !vc.isBiallelic()) ) {
results.add(vc);
} else {
Map<String, AlignmentContext> stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context);
if ( stratifiedContexts != null )
results.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
else
results.add(vc);
}
}
for ( VariantContext vc : results )
vcfWriter.add(vc ,ref.getBase());
return 1;
}
/**
 * Accumulates the per-locus counts returned by map(..).
 */
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
/**
 * Combines partial sums from parallel traversals.
 */
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}
/**
 * Prints, for each input table, the number of records annotated from it.
 */
public void onTraversalDone(Integer sum) {
//out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords);
Map<String, Integer> inputTableHitCounter = engine.getInputTableHitCounter();
for ( Entry<String, Integer> e : inputTableHitCounter.entrySet() ) {
final String bindingName = e.getKey();
final int counter = e.getValue();
//final float percent = 100 * counter /(float) totalOutputVCFRecords;
//out.printf(" %-6.1f%% (%d) annotated with %s.\n", percent, counter, bindingName );
System.out.printf(" %d annotated with %s.\n", counter, bindingName );
}
}
}

View File

@ -1,226 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
* This is a container that holds all data corresponding to a single join table as specified by one -J arg (ex: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2).
* Some terminology:
* 'bindingName' is an arbitrary label for a given table that is specified on the command line with either the -B or -J arg.
* In the example above, bindingName1 is the 'local' binding name because it is attached to the join table file provided with this -J arg. bindingName2 is the 'external' binding name because
* it corresponds to some other table specified previously with another -B or -J arg.
*
* The JoinTable object stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName1.columnName in the above example),
* and the entry value is an ArrayList representing the entire join table record.
* The JoinTable object also stores some other join table parameters such as the column names that were parsed out of the file header, and the bindingNames and columnNames from the -J arg.
*
* The join operation is performed by looking up the value of the join column in the external table (the one that this table is being joined to), and then using this value to do a lookup
* on the map - if there's a hit, it will provide the record from the join table that is to be joined with the record in the external table.
*
* More information can be found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator
*/
public class JoinTable
{
//the list of join table column names parsed out of the file header.
private List<String> columnNames; //not fully-qualified
//the -J binding name attached to this join table's file
private String localBindingName;
//the binding name of the previously-declared table being joined to
private String externalBindingName;
//the column, within the external table, being joined on
private String externalColumnName;
//stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName.columnName in the above example),
//and the entry value is an ArrayList representing the entire join table record.
private HashMap<String, ArrayList<String>> joinColumnValueToRecords = new HashMap<String, ArrayList<String>>();
//maximum number of rows allowed in this table (see --maxJoinTableSize); enforced in put(..)
private int maxSize;
//guards against parseFromFile(..) being called more than once
private boolean parsedFromFile = false;
/**
 * @param maxSize maximum number of rows this table may hold before put(..) throws.
 */
public JoinTable(int maxSize) {
this.maxSize = maxSize;
}
/**
 * Parses the table from the given file using the JoinTableParser.
 * May only be called once per JoinTable instance.
 *
 * @param filename The file containing the table.
 * @param localBindingName The binding name within the given file to join on.
 * @param localColumnName The column name within the given file to join on.
 * @param externalBindingName The binding name of another file (previously specified with either -B or -J).
 * @param externalColumnName The column name in this other file to join on.
 */
public void parseFromFile(String filename, String localBindingName, String localColumnName, String externalBindingName, String externalColumnName) {
if(parsedFromFile) {
throw new ReviewedStingException("parseFromFile(" + filename +", ..) called more than once");
}
parsedFromFile = true;
setLocalBindingName(localBindingName);
setExternalBindingName(externalBindingName);
setExternalColumnName(externalColumnName);
BufferedReader br = null;
try
{
br = new BufferedReader(new FileReader(filename));
final JoinTableParser parser = new JoinTableParser();
//read in the header
columnNames = parser.readHeader(br);
//get the index of the localJoinColumnName
int localColumnNameIdx = -1;
for(int i = 0; i < columnNames.size(); i++) {
final String columnName = columnNames.get(i);
if(columnName.equals(localColumnName)) {
localColumnNameIdx = i;
break;
}
}
if(localColumnNameIdx == -1) {
throw new UserException.BadArgumentValue("-J", "The -J arg specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename);
}
//read in all records and create a map entry for each
String line;
while((line = br.readLine()) != null) {
final ArrayList<String> columnValues = parser.parseLine(line);
if ( columnValues.size() < columnNames.size() )
throw new UserException.BadInput("the file: " + filename + " is malformed as there are not a sufficient number of columns for this line: " + line);
//the join column's value becomes the lookup key for this record
final String joinColumnValue = columnValues.get(localColumnNameIdx);
put(joinColumnValue, columnValues, filename);
}
}
catch(IOException e)
{
throw new UserException.CouldNotReadInputFile(new File(filename), "Unable to parse file", e);
}
finally
{
try {
if(br != null) {
br.close();
}
} catch(IOException e) {
throw new ReviewedStingException("Unable to close file: " + filename, e);
}
}
}
/**
 * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2,
 * this returns bindingName1.
 * @return local binding name
 */
public String getLocalBindingName() {
return localBindingName;
}
//setter used by parseFromFile(..)
public void setLocalBindingName(String localBindingName) {
this.localBindingName = localBindingName;
}
/**
 * @return the list of join table column names parsed out of the file header.
 */
public List<String> getColumnNames() {
return columnNames; //not fully-qualified
}
//setter exposed for subclasses/tests; parseFromFile(..) assigns columnNames directly
protected void setColumnNames(List<String> columnNames) {
this.columnNames = columnNames;
}
/**
 * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2,
 * this returns columnName2.
 * @return external column name
 */
public String getExternalColumnName() {
return externalColumnName;
}
//setter used by parseFromFile(..)
protected void setExternalColumnName(
String externalColumnName) {
this.externalColumnName = externalColumnName;
}
/**
 * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2,
 * this returns bindingName2.
 * @return external binding name
 */
public String getExternalBindingName() {
return externalBindingName;
}
//setter used by parseFromFile(..)
protected void setExternalBindingName(
String externalBindingName) {
this.externalBindingName = externalBindingName;
}
/**
 * Whether any join table records have the given value in the join column.
 * @param joinColumnValue value
 * @return true if the given name value exists in the file
 */
public boolean containsJoinColumnValue(String joinColumnValue) {
return joinColumnValueToRecords.containsKey(joinColumnValue);
}
/**
 * Returns all records in the table where the join column has the given value.
 * @param joinColumnValue column value
 * @return row
 */
public ArrayList<String> get(String joinColumnValue) {
return joinColumnValueToRecords.get(joinColumnValue);
}
/**
 * Adds the given record to the map. Throws if the join column value is a
 * duplicate, or if adding the record would exceed maxSize.
 * @param joinColumnValue value
 * @param record row
 * @param filename the source file name
 */
protected void put(String joinColumnValue, ArrayList<String> record, String filename) {
if ( joinColumnValueToRecords.containsKey(joinColumnValue) )
throw new UserException.BadInput("the file " + filename + " contains non-unique entries for the requested column, which isn't allowed.");
joinColumnValueToRecords.put(joinColumnValue, record);
if ( joinColumnValueToRecords.size() > maxSize )
throw new UserException.BadInput("the file " + filename + " contains more than the maximum number (" + maxSize + ") of allowed rows (see the --maxJoinTableSize argument).");
}
}

View File

@ -1,131 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* Used to parse files passed to the GenomicAnnotator via the -J arg.
* The files must be tab-delimited, and the first non-empty/non-commented line
* must be a header containing column names.
*
* More information can be found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator
*/
public class JoinTableParser
{
    public static final String DELIMITER = "\t";

    // column names parsed out of the header line; null until readHeader(..) is called
    private List<String> header;

    /**
     * Constructor.
     */
    public JoinTableParser() {}

    /**
     * Reads the header line from the given reader, caches it, and returns it.
     * May only be called once per parser instance.
     *
     * @param br source
     * @return column names
     * @throws IOException on read
     */
    public List<String> readHeader(BufferedReader br) throws IOException
    {
        if (header != null) {
            throw new ReviewedStingException("readHeader(..) called more than once. Header is currently set to: " + header);
        }
        final List<String> columnNames = parseHeader(br);
        header = Collections.unmodifiableList(columnNames);
        return header;
    }

    /**
     * @return A list containing the column names, or null if readHeader(..) hasn't run yet.
     */
    public List<String> getHeader() {
        return header;
    }

    /**
     * Splits the given line into its column values and validates the count
     * against the header.
     *
     * @param line to parse
     * @return tokens, one per column
     */
    public ArrayList<String> parseLine(String line) {
        final ArrayList<String> tokens = Utils.split(line, DELIMITER, header.size());
        if ( tokens.size() != header.size() ) {
            throw new UserException.MalformedFile(String.format("Encountered a row with %d columns which is different from the number or columns in the header: %d\nHeader: " + header + "\nLine: " + tokens, tokens.size(), header.size()));
        }
        return tokens;
    }

    /**
     * Locates and parses the first non-empty, non-comment line of the reader
     * as the header.
     *
     * @param br The file to read.
     * @return ArrayList containing column names from the header.
     * @throws IOException on reading
     */
    public static ArrayList<String> parseHeader(final BufferedReader br) throws IOException
    {
        //scan forward to the 1st line that's non-empty and not a comment
        for ( String line = br.readLine(); line != null; line = br.readLine() ) {
            final String trimmed = line.trim();
            if ( !trimmed.isEmpty() && !trimmed.startsWith("#") ) {
                return Utils.split(trimmed, DELIMITER);
            }
        }
        //every line was blank or a comment
        throw new IllegalArgumentException("No header in " + br + ". All lines are either comments or empty.");
    }
}

View File

@ -62,7 +62,6 @@ public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
private Set<String> trackNames = new HashSet<String>();
public void initialize() {
UAC.NO_SLOD = true;
for ( ReferenceOrderedDataSource d : getToolkit().getRodDataSources() ) {
if ( d.getName().startsWith("variant") )

View File

@ -58,8 +58,8 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
public boolean NO_SLOD = false;
@Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false)
public boolean COMPUTE_SLOD = false;
// control the error modes
@ -154,7 +154,7 @@ public class UnifiedArgumentCollection {
uac.PCR_error = PCR_error;
uac.GenotypingMode = GenotypingMode;
uac.OutputMode = OutputMode;
uac.NO_SLOD = NO_SLOD;
uac.COMPUTE_SLOD = COMPUTE_SLOD;
uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;

View File

@ -144,7 +144,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
// annotation (INFO) fields from UnifiedGenotyper
if ( !UAC.NO_SLOD )
if ( UAC.COMPUTE_SLOD )
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));

View File

@ -255,7 +255,7 @@ public class UnifiedGenotyperEngine {
pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc).iterator().next();
vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc);
}
return new VariantCallContext(vc, ref.getBase(), false);
@ -372,8 +372,8 @@ public class UnifiedGenotyperEngine {
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( !UAC.NO_SLOD && bestAFguess != 0 ) {
final boolean DEBUG_SLOD = false;
if ( UAC.COMPUTE_SLOD && bestAFguess != 0 ) {
//final boolean DEBUG_SLOD = false;
// the overall lod
VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model);
@ -381,7 +381,7 @@ public class UnifiedGenotyperEngine {
afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model);
@ -390,7 +390,7 @@ public class UnifiedGenotyperEngine {
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model);
@ -399,11 +399,11 @@ public class UnifiedGenotyperEngine {
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
//if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
// strand score is max bias between forward and reverse strands
double strandScore = Math.max(forwardLod, reverseLod);
@ -436,8 +436,7 @@ public class UnifiedGenotyperEngine {
pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element.
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
}
VariantCallContext call = new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));

View File

@ -64,7 +64,7 @@ public class RealignerTargetCreator extends RodWalker<RealignerTargetCreator.Eve
protected int windowSize = 10;
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false)
protected double mismatchThreshold = 0.15;
protected double mismatchThreshold = 0.0;
@Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false)
protected int minReadsAtLocus = 4;

View File

@ -1,890 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcid;
import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcidTable;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
/**
 * Walks along all variant ROD loci, and dynamically annotates alleles at MNP records.
 *
 * Per-position RefSeq coding annotations (from the "refseq" ROD) are accumulated as the
 * walker advances, and when the LAST position of an MNP is reached they are combined into
 * codon-level INFO annotations for each alternate allele.  Because an MNP can only be
 * emitted once its stop site has been seen, output records are buffered and re-ordered
 * through a ManualSortingVCFWriter.
 */
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = AnnotateMNPsWalker.REFSEQ_ROD_NAME, type = AnnotatorInputTableFeature.class), @RMD(name = AnnotateMNPsWalker.VARIANT_ROD_NAME, type = ReferenceOrderedDatum.class)})
public class AnnotateMNPsWalker extends RodWalker<Integer, Integer> {
    @Output(doc = "File to which variants should be written", required = true)
    protected VCFWriter writer = null;
    // Wraps 'writer' after initialize(); allows out-of-order writes to be emitted sorted:
    private ManualSortingVCFWriter sortingWriter = null;

    @Argument(fullName = "emitOnlyMNPs", shortName = "emitOnlyMNPs", doc = "Only output MNP records; [default:false]", required = false)
    protected boolean emitOnlyMNPs = false;

    private LinkedList<String> rodNames = null;
    private GenomeLocParser locParser = null;
    // MNP start locus -> set of stop loci of MNPs starting there, i.e. the MNPs we are
    // still "waiting for".  Must be TreeMap sorted by START sites!
    private TreeMap<GenomeLoc, Set<GenomeLoc>> MNPstartToStops = null;

    public final static String REFSEQ_ROD_NAME = "refseq";
    public final static String VARIANT_ROD_NAME = "variant";
    // Cache of per-locus RefSeq features, pruned once no pending MNP needs them:
    private LocusToFeatures locusToRefSeqFeatures = null;

    // INFO-field key prefix and RefSeq table column names used below:
    protected final static String MNP_ANNOTATION_KEY_PREFIX = "MNP.refseq.";
    protected final static String REFSEQ_NAME = "name";
    protected final static String REFSEQ_NAME2 = "name2";
    protected final static String REFSEQ_POSITION_TYPE = "positionType";
    protected final static String REFSEQ_CDS = "CDS";
    protected final static String REFSEQ_STRAND = "transcriptStrand";
    protected final static String REFSEQ_POS_STRAND = "+";
    protected final static String REFSEQ_NEG_STRAND = "-";
    protected final static String REFSEQ_CODON_COORD = "codonCoord";
    protected final static String REFSEQ_CODING_FRAME = "frame";
    protected final static String REFSEQ_REF_CODON = "referenceCodon";
    protected final static String REFSEQ_REF_AA = "referenceAA";
    protected final static String REFSEQ_ALT_BASE = "haplotypeAlternate";
    protected final static String REFSEQ_VARIANT_CODON = "variantCodon";
    protected final static String REFSEQ_VARIANT_AA = "variantAA";
    protected final static String REFSEQ_CHANGES_AA = "changesAA";
    protected final static String REFSEQ_FUNCTIONAL_CLASS = "functionalClass";
    protected final static String REFSEQ_PROTEIN_COORD_DESCRIPTION = "proteinCoordStr";
    protected final static String REFSEQ_CODING_ANNOTATIONS = "codingVariants";
    protected final static String REFSEQ_NUM_AA_CHANGES = "numAAchanges";
    protected final static String REFSEQ_HAS_MULT_AA_CHANGES = "alleleHasMultAAchanges";

    /**
     * Sets up ROD names, the location parser, MNP bookkeeping, and the (sorting) VCF writer.
     */
    public void initialize() {
        rodNames = new LinkedList<String>();
        rodNames.add(VARIANT_ROD_NAME);

        locParser = getToolkit().getGenomeLocParser();
        MNPstartToStops = new TreeMap<GenomeLoc, Set<GenomeLoc>>(); // sorted by start sites
        initializeVcfWriter();

        locusToRefSeqFeatures = new LocusToFeatures();
    }

    /**
     * Wraps the output writer in a ManualSortingVCFWriter and writes the VCF header,
     * copying the sample names from the input variant ROD's header.
     */
    private void initializeVcfWriter() {
        sortingWriter = new ManualSortingVCFWriter(writer);
        writer = sortingWriter;

        // setup the header fields:
        Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
        hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
        hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));

        Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames);
        writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples())));
    }

    public boolean generateExtendedEvents() {
        return false;
    }

    public Integer reduceInit() {
        return 0;
    }

    /**
     * For each site of interest, annotate it if it's a MNP.
     *
     * @param tracker the meta-data tracker
     * @param ref the reference base
     * @param context the context for the given locus
     * @return count of MNPs observed
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if (tracker == null)
            return null;
        int numMNPsObserved = 0;

        GenomeLoc curLocus = ref.getLocus();
        // Drop cached RefSeq features no pending MNP can still reference:
        clearOldLocusFeatures(curLocus);

        boolean requireStartHere = false; // see EVERY site of the MNP
        boolean takeFirstOnly = false; // take as many entries as the VCF file has
        for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
            GenomeLoc vcLoc = VariantContextUtils.getLocation(locParser, vc);
            boolean atStartOfVc = curLocus.getStart() == vcLoc.getStart();
            boolean atEndOfVc = curLocus.getStart() == vcLoc.getStop();

            if (vc.isMNP()) {
                logger.debug("Observed MNP at " + vcLoc);

                if (isChrM(vc)) {
                    // Mitochondrial MNPs are emitted unannotated (once, at their start):
                    if (atStartOfVc) {
                        logger.warn("Skipping mitochondrial MNP at " + vcLoc + " due to complexity of coding table [need to know if first codon, etc.]...");
                        writeVCF(vc);
                    }
                    continue;
                }

                // Cache this locus's RefSeq features; they are needed until the MNP's stop:
                GenomeLoc stopLoc = locParser.createGenomeLoc(curLocus.getContig(), vcLoc.getStop());
                final List<Object> refSeqRODs = tracker.getReferenceMetaData(REFSEQ_ROD_NAME);
                for (Object refSeqObject : refSeqRODs) {
                    AnnotatorInputTableFeature refSeqAnnotation = (AnnotatorInputTableFeature) refSeqObject;
                    locusToRefSeqFeatures.putLocusFeatures(curLocus, refSeqAnnotation, stopLoc);
                }

                if (atStartOfVc) { // MNP is starting here, so register that we're waiting for it
                    Set<GenomeLoc> stopLocs = MNPstartToStops.get(curLocus);
                    if (stopLocs == null) {
                        stopLocs = new HashSet<GenomeLoc>();
                        MNPstartToStops.put(curLocus, stopLocs);
                    }
                    stopLocs.add(stopLoc);
                }

                if (atEndOfVc) {
                    numMNPsObserved++; // only count a MNP at its stop site
                    logger.debug("Observed end of MNP at " + curLocus);
                    logger.debug("Current list of per-locus features\n" + locusToRefSeqFeatures);

                    // All positions of the MNP have been seen: build and attach its annotations:
                    Map<String, Object> MNPannotations = annotateMNP(vc);
                    MNPannotations.putAll(RefSeqDataParser.removeRefSeqAttributes(vc.getAttributes())); // remove any RefSeq INFO, since adding it in more thoroughly here
                    vc = VariantContext.modifyAttributes(vc, MNPannotations);
                    writeVCF(vc);

                    // De-register this MNP from the "waiting" map:
                    GenomeLoc startLoc = locParser.createGenomeLoc(curLocus.getContig(), vcLoc.getStart());
                    Set<GenomeLoc> stopLocs = MNPstartToStops.get(startLoc);
                    if (stopLocs != null) { // otherwise, just removed stopLocs due to another MNP that has the same (start, stop)
                        stopLocs.remove(stopLoc);
                        if (stopLocs.isEmpty()) // no longer waiting for startLoc
                            MNPstartToStops.remove(startLoc);
                    }
                }
            }
            else if (atStartOfVc && !emitOnlyMNPs) {// only want to write other VariantContexts records once (where they start):
                writeVCF(vc);
            }
        }

        // Tell the sorting writer how far upstream it is now safe to flush records:
        // (one base before the earliest MNP start we are still waiting on, or unbounded if none)
        Integer mostUpstreamWritableLoc = null;
        if (!MNPstartToStops.isEmpty()) {
            GenomeLoc waitingForLoc = MNPstartToStops.entrySet().iterator().next().getKey();
            mostUpstreamWritableLoc = waitingForLoc.getStart() - 1;
        }
        sortingWriter.setmostUpstreamWritableLocus(mostUpstreamWritableLoc);

        return numMNPsObserved;
    }

    // True if the record lies on the mitochondrial contig (either naming convention):
    private static boolean isChrM(final VariantContext vc) {
        return vc.getChr().equals("chrM") || vc.getChr().equals("MT");
    }

    /**
     * Builds the MNP.refseq.* INFO annotations for a completed MNP, one group of keys
     * per RefSeq transcript overlapping it (keys get a "_i" suffix when there are several).
     *
     * @param vc the MNP record (all of whose positions have cached RefSeq features)
     * @return map of INFO key -> annotation value
     */
    private Map<String, Object> annotateMNP(VariantContext vc) {
        Map<String, Object> annotations = new HashMap<String, Object>();

        RefSeqNameToFeatures nameToPositionalFeatures = new RefSeqNameToFeatures(vc);
        MNPannotationKeyBuilder kb = new MNPannotationKeyBuilder(nameToPositionalFeatures);

        for (Map.Entry<String, RefSeqFeatureList> nameToFeatureEntry : nameToPositionalFeatures.entrySet()) {
            String featureName = nameToFeatureEntry.getKey();
            RefSeqFeatureList feature = nameToFeatureEntry.getValue();

            CodonAnnotationsForAltAlleles codonAnnotationsForAlleles = new CodonAnnotationsForAltAlleles(vc, feature);
            annotations.put(kb.getKey(REFSEQ_CODING_ANNOTATIONS), codonAnnotationsForAlleles.getCodonAnnotationsString());
            annotations.put(kb.getKey(REFSEQ_NUM_AA_CHANGES), codonAnnotationsForAlleles.getNumAAchangesString());
            annotations.put(kb.getKey(REFSEQ_HAS_MULT_AA_CHANGES), codonAnnotationsForAlleles.hasAlleleWithMultipleAAchanges);

            annotations.put(kb.getKey(REFSEQ_NAME), featureName);
            annotations.put(kb.getKey(REFSEQ_NAME2), feature.name2);
            annotations.put(kb.getKey(REFSEQ_POSITION_TYPE), REFSEQ_CDS);
            annotations.put(kb.getKey(REFSEQ_STRAND), (feature.positiveStrand ? REFSEQ_POS_STRAND : REFSEQ_NEG_STRAND));
            annotations.put(kb.getKey(REFSEQ_CODON_COORD), feature.getCodonCoordString());

            kb.incrementFeatureIndex();
        }

        return annotations;
    }

    // Builds INFO keys of the form "MNP.refseq.<type>[_<i>]", adding the index suffix only
    // when more than one RefSeq transcript overlaps the MNP:
    private static class MNPannotationKeyBuilder {
        private int featureIndex;
        private boolean multipleEntries;

        public MNPannotationKeyBuilder(RefSeqNameToFeatures nameToPositionalFeatures) {
            this.featureIndex = 1;
            this.multipleEntries = nameToPositionalFeatures.nameToFeatures.size() > 1;
        }

        public void incrementFeatureIndex() {
            featureIndex++;
        }

        public String getKey(String type) {
            String annotationKey = MNP_ANNOTATION_KEY_PREFIX + type;
            if (multipleEntries)
                annotationKey += "_" + featureIndex;

            return annotationKey;
        }
    }

    // Unboxes a Byte[] into a byte[]; throws if any element is null:
    private static byte[] ByteArrayToPrimitive(Byte[] nonNullArray) {
        byte[] primArray = new byte[nonNullArray.length];
        for (int i = 0; i < nonNullArray.length; i++) {
            if (nonNullArray[i] == null)
                throw new ReviewedStingException("nonNullArray[i] == null");
            primArray[i] = nonNullArray[i];
        }
        return primArray;
    }

    // Removes cached locus features whose furthest dependent locus is behind curLoc:
    private void clearOldLocusFeatures(GenomeLoc curLoc) {
        Iterator<Map.Entry<GenomeLoc, PositionalRefSeqFeatures>> locusFeaturesIt = locusToRefSeqFeatures.entrySet().iterator();
        while (locusFeaturesIt.hasNext()) {
            Map.Entry<GenomeLoc, PositionalRefSeqFeatures> locusFeaturesEntry = locusFeaturesIt.next();
            if (curLoc.isPast(locusFeaturesEntry.getValue().getFurthestLocusUsingFeatures()))
                locusFeaturesIt.remove();
        }
    }

    // Sums per-locus MNP counts (map() may return null when there is no tracker):
    public Integer reduce(Integer count, Integer total) {
        if (count != null)
            total = total + count;

        return total;
    }

    /**
     * @param result the number of MNPs processed.
     */
    public void onTraversalDone(Integer result) {
        System.out.println("Number of MNPs observed: " + result);
        writer.close();
    }

    private void writeVCF(VariantContext vc) {
        WriteVCF.writeVCF(vc, writer, logger);
    }

    /*
    Inner classes:
     */

    // Maps: RefSeq entry name -> features for ALL positions of a particular VariantContext MNP:
    private class RefSeqNameToFeatures {
        private Map<String, RefSeqFeatureList> nameToFeatures;

        // Gathers the cached per-position features for every position of the MNP,
        // grouped by RefSeq transcript name:
        public RefSeqNameToFeatures(VariantContext vc) {
            this.nameToFeatures = new HashMap<String, RefSeqFeatureList>();

            int MNPstart = vc.getStart();
            int MNPstop = vc.getEnd();
            int MNPlength = MNPstop - MNPstart + 1;
            for (int i = 0; i < MNPlength; i++) {
                int genomicPosition = MNPstart + i;
                GenomeLoc posLoc = locParser.createGenomeLoc(vc.getChr(), genomicPosition);
                PositionalRefSeqFeatures locFeatures = locusToRefSeqFeatures.getLocusFeatures(posLoc);
                if (locFeatures == null) // no features for posLoc
                    continue;

                for (Map.Entry<String, PositionalRefSeqFeature> nameToFeatureEntry : locFeatures.entrySet()) {
                    String name = nameToFeatureEntry.getKey();
                    PositionalRefSeqFeature posFeature = nameToFeatureEntry.getValue();

                    RefSeqFeatureList featureList = nameToFeatures.get(name);
                    if (featureList == null) {
                        featureList = new RefSeqFeatureList(MNPlength);
                        nameToFeatures.put(name, featureList);
                    }
                    featureList.updateFeatureAtPosition(i, posFeature);
                }
            }
        }

        public Set<Map.Entry<String, RefSeqFeatureList>> entrySet() {
            return nameToFeatures.entrySet();
        }
    }

    // For a particular RefSeq entry, contains the features for ALL positions of a particular VariantContext MNP
    private static class RefSeqFeatureList {
        private final static String CODON_FRAME_START = "(";
        private final static String CODON_FRAME_END = ")";
        private final static String CODON_DELIM = "|";

        // One entry per MNP position (null where that position had no CDS feature):
        private CodingRefSeqFeature[] refSeqFeatures;
        private String name2;
        private Boolean positiveStrand;
        private Map<Integer, List<Integer>> codonToIndices; // Map of: codon index -> MNP indices that refer to codon

        public RefSeqFeatureList(int MNPlength) {
            this.refSeqFeatures = new CodingRefSeqFeature[MNPlength];
            for (int i = 0; i < MNPlength; i++)
                this.refSeqFeatures[i] = null;

            this.name2 = null;
            this.positiveStrand = null;
            this.codonToIndices = new TreeMap<Integer, List<Integer>>();
        }

        // Records the feature at MNP offset 'index', verifying transcript consistency:
        public void updateFeatureAtPosition(int index, PositionalRefSeqFeature feature) {
            if (name2 == null) {
                name2 = feature.name2;
                positiveStrand = feature.positiveStrand;
            }
            else if (!name2.equals(feature.name2) || positiveStrand != feature.positiveStrand) {
                throw new UserException("Inconsistency between previous RefSeq entry and: " + feature);
            }

            CodingRefSeqFeature crsf = new CodingRefSeqFeature(feature);
            refSeqFeatures[index] = crsf;

            List<Integer> indicesWithCodon = codonToIndices.get(crsf.codonCoord);
            if (indicesWithCodon == null) {
                indicesWithCodon = new LinkedList<Integer>();
                codonToIndices.put(crsf.codonCoord, indicesWithCodon);
            }
            indicesWithCodon.add(index);
        }

        public Set<Map.Entry<Integer, List<Integer>>> codonIndicesEntrySet() {
            return codonToIndices.entrySet();
        }

        // Renders "codon(frame)" per position, '|'-delimited, e.g. "12(0)|12(1)":
        public String getCodonCoordString() {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < refSeqFeatures.length; i++) {
                CodingRefSeqFeature crsf = refSeqFeatures[i];
                if (crsf != null)
                    sb.append(crsf.codonCoord).append(CODON_FRAME_START).append(crsf.codingFrame).append(CODON_FRAME_END);

                if (i < refSeqFeatures.length - 1)
                    sb.append(CODON_DELIM);
            }

            return sb.toString();
        }
    }

    // The coding-relevant subset of a PositionalRefSeqFeature (codon coordinate, frame,
    // reference codon [upper-cased] and reference amino acid):
    private static class CodingRefSeqFeature {
        protected int codonCoord;
        protected int codingFrame;
        protected String referenceCodon;
        protected String referenceAA;

        public CodingRefSeqFeature(PositionalRefSeqFeature feature) {
            this.codonCoord = feature.codonCoord;
            this.codingFrame = feature.codingFrame;
            this.referenceCodon = feature.referenceCodon.toUpperCase();
            this.referenceAA = feature.referenceAA;
        }
    }

    // Per-codon annotations for every alternate allele of an MNP, plus summary counts
    // of amino-acid changes per allele:
    private static class CodonAnnotationsForAltAlleles {
        protected final static int MIN_CODON_INDEX = 0;
        protected final static int NUM_CODON_INDICES = 3;

        private final static String CODON_ANNOTATION_DELIM = ",";

        private List<SingleCodonAnnotationsForAlleles> alleleAnnotations;
        private int[] alleleToNumAAchanges;
        private boolean hasAlleleWithMultipleAAchanges;

        public CodonAnnotationsForAltAlleles(VariantContext vc, RefSeqFeatureList feature) {
            this.alleleAnnotations = new LinkedList<SingleCodonAnnotationsForAlleles>();

            Set<Allele> altAlleles = vc.getAlternateAlleles();
            int numAltAlleles = altAlleles.size();
            this.alleleToNumAAchanges = new int[numAltAlleles];
            for (int i = 0; i < numAltAlleles; i++)
                this.alleleToNumAAchanges[i] = 0;

            int MNPstart = vc.getStart();
            int MNPstop = vc.getEnd();
            int MNPlength = MNPstop - MNPstart + 1;

            // Process each codon touched by the MNP (codonToIndices is sorted by codon):
            for (Map.Entry<Integer, List<Integer>> codonToIndicesEntry : feature.codonIndicesEntrySet()) {
                int codonIndex = codonToIndicesEntry.getKey();
                List<Integer> indices = codonToIndicesEntry.getValue();
                if (indices.isEmpty())
                    throw new ReviewedStingException("indices should not exist if it's empty!");

                // Sanity-check each MNP offset claimed for this codon:
                for (int index : indices) {
                    int frame = feature.refSeqFeatures[index].codingFrame;
                    if (feature.refSeqFeatures[index].codonCoord != codonIndex)
                        throw new ReviewedStingException("LOGICAL ERROR: feature.refSeqFeatures[index].codonCoord != codonIndex");
                    if (frame < MIN_CODON_INDEX || frame >= NUM_CODON_INDICES)
                        throw new UserException("RefSeq codon frame not one of {0,1,2}");
                }

                CodingRefSeqFeature firstFeatureForCodon = feature.refSeqFeatures[indices.get(0)];
                String refCodon = firstFeatureForCodon.referenceCodon;

                SingleCodonAnnotationsForAlleles codonAnnotation = new SingleCodonAnnotationsForAlleles(codonIndex, altAlleles, MNPlength, refCodon, firstFeatureForCodon, indices, feature);
                alleleAnnotations.add(codonAnnotation);

                // From a single codon, summarize the data for ALL alleles:
                for (int i = 0; i < numAltAlleles; i++) {
                    if (codonAnnotation.annotationsForAlleles[i].codonFunc.changesAA) {
                        alleleToNumAAchanges[i]++;
                        if (alleleToNumAAchanges[i] > 1)
                            this.hasAlleleWithMultipleAAchanges = true;
                    }
                }
            }
        }

        // Joins the per-codon annotation strings with ',':
        public String getCodonAnnotationsString() {
            StringBuilder sb = new StringBuilder();

            int index = 0;
            for (SingleCodonAnnotationsForAlleles codonToAlleles : alleleAnnotations) {
                sb.append(codonToAlleles);
                if (index < alleleAnnotations.size() - 1)
                    sb.append(CODON_ANNOTATION_DELIM);
                index++;
            }

            return sb.toString();
        }

        // Joins the per-allele AA-change counts with ',':
        public String getNumAAchangesString() {
            StringBuilder sb = new StringBuilder();

            for (int index = 0; index < alleleToNumAAchanges.length; index++) {
                sb.append(alleleToNumAAchanges[index]);
                if (index < alleleToNumAAchanges.length - 1)
                    sb.append(SingleCodonAnnotationsForAlleles.ALLELE_ANNOTATION_DELIM);
            }

            return sb.toString();
        }
    }

    // For ONE codon: the reference codon/AA plus the derived annotations for each
    // alternate allele:
    private static class SingleCodonAnnotationsForAlleles {
        private final static String CODON_MAP_SYMBOL = "->";
        private final static String CODON_ANNOTATION_START = "[";
        private final static String CODON_ANNOTATION_END = "]";
        private final static String REF_CODON_INFO_DELIM = "|";
        private final static String ALLELE_ANNOTATION_DELIM = ",";
        private final static String ASSIGNMENT = ":";

        private int codonIndex;
        private String refCodon;
        private String refAA;
        private SingleCodonAnnotationsForAllele[] annotationsForAlleles;

        public SingleCodonAnnotationsForAlleles(int codonIndex, Collection<Allele> altAlleles, int MNPlength, String refCodon, CodingRefSeqFeature firstFeatureForCodon, List<Integer> indices, RefSeqFeatureList feature) {
            if (refCodon.length() != CodonAnnotationsForAltAlleles.NUM_CODON_INDICES)
                throw new UserException("RefSeq reference codon " + refCodon + " is not of length " + CodonAnnotationsForAltAlleles.NUM_CODON_INDICES);

            // Cross-check the RefSeq table's reference AA against our own translation:
            AminoAcid refAA = AminoAcidTable.getEukaryoticAA(refCodon);
            if (!refAA.getCode().equals(firstFeatureForCodon.referenceAA))
                throw new UserException("RefSeq: translated reference codon= " + refAA + " != " + firstFeatureForCodon.referenceAA + " = reference AA");

            this.codonIndex = codonIndex;
            this.refCodon = refCodon;
            this.refAA = refAA.getCode();
            this.annotationsForAlleles = new SingleCodonAnnotationsForAllele[altAlleles.size()];

            int altInd = 0;
            for (Allele altAllele : altAlleles) {
                if (altAllele.length() != MNPlength)
                    throw new ReviewedStingException("length(altAllele) != length(MNP)");
                byte[] altBases = altAllele.getBases();

                // Build the variant codon: start with all positions unknown...
                Byte[] variantCodonArr = new Byte[CodonAnnotationsForAltAlleles.NUM_CODON_INDICES];
                for (int i = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; i < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; i++)
                    variantCodonArr[i] = null;

                // ...fill in each frame position covered by the MNP's alternate bases:
                for (int index : indices) {
                    int frame = feature.refSeqFeatures[index].codingFrame;
                    if (variantCodonArr[frame] != null)
                        throw new UserException("RefSeq assigns codon " + codonIndex + " twice at same frame: " + frame);

                    byte base = altBases[index];
                    if (!feature.positiveStrand) // negative strand codon
                        base = BaseUtils.simpleComplement(base);
                    variantCodonArr[frame] = base;
                }

                /* For missing frames, there MUST exist AT LEAST one index that refers to this codon,
                   so use it to derive the missing bases [ALREADY complemented if on the negative strand]:
                 */
                for (int frame = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; frame < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; frame++) {
                    if (variantCodonArr[frame] == null)
                        variantCodonArr[frame] = (byte) refCodon.charAt(frame);
                }
                String variantCodon = new String(ByteArrayToPrimitive(variantCodonArr)).toUpperCase();

                SingleCodonAnnotationsForAllele alleleAnnotation = new SingleCodonAnnotationsForAllele(variantCodon, refCodon, refAA, codonIndex);
                annotationsForAlleles[altInd] = alleleAnnotation;
                altInd++;
            }
        }

        // Renders "codon->[referenceCodon:XXX|referenceAA:Y|<allele1>,<allele2>,...]":
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append(codonIndex).append(CODON_MAP_SYMBOL).append(CODON_ANNOTATION_START);
            sb.append(REFSEQ_REF_CODON).append(ASSIGNMENT).append(refCodon).append(REF_CODON_INFO_DELIM);
            sb.append(REFSEQ_REF_AA).append(ASSIGNMENT).append(refAA).append(REF_CODON_INFO_DELIM);

            int index = 0;
            for (SingleCodonAnnotationsForAllele annotation : annotationsForAlleles) {
                sb.append(annotation);
                if (index < annotationsForAlleles.length - 1)
                    sb.append(ALLELE_ANNOTATION_DELIM);
                index++;
            }
            sb.append(CODON_ANNOTATION_END);

            return sb.toString();
        }
    }

    // The annotations one alternate allele induces on one codon (variant codon/AA,
    // functional class, protein coordinate, and whether the AA change depends on
    // observing the MNP's bases jointly rather than one at a time):
    private static class SingleCodonAnnotationsForAllele {
        private final static String ALLELE_START = "{";
        private final static String ALLELE_END = "}";
        private final static String CODON_INFO_DELIM = "|";
        private final static String ASSIGNMENT = ":";

        private final static String MNP_DEPENDENT_AA = "MNPdependentAA";

        private CodonFunction codonFunc;
        private String proteinCoordStr;
        private boolean MNPdependentAA;
        private String originalAA;

        public SingleCodonAnnotationsForAllele(String variantCodon, String refCodon, AminoAcid refAA, int codonIndex) {
            this.codonFunc = new CodonFunction(variantCodon, refCodon, refAA);
            this.proteinCoordStr = "p." + refAA.getLetter() + codonIndex + codonFunc.variantAA.getLetter();

            int refCodonLength = refCodon.length();
            if (codonFunc.variantCodon.length() != refCodonLength)
                throw new ReviewedStingException("codonFunc.variantCodon.length() != refCodonLength, but ALREADY checked that they're both 3");

            // The variant AA is "MNP-dependent" if NO single-base change within the codon
            // yields the same AA; in that case originalAA collects the single-base AAs:
            this.MNPdependentAA = true;
            this.originalAA = "(";
            for (int i = 0; i < refCodonLength; i++) {
                // Take [0,i-1] and [i+1, end] from refCodon, and i from variantCodon:
                String singleBaseChangeCodon = refCodon.substring(0, i) + variantCodon.substring(i, i+1) + refCodon.substring(i+1, refCodonLength);
                CodonFunction singleBaseChangeCodonFunc = new CodonFunction(singleBaseChangeCodon, refCodon, refAA);
                if (singleBaseChangeCodonFunc.variantAA.equals(codonFunc.variantAA)) {
                    this.MNPdependentAA = false;
                    this.originalAA = "";
                    break;
                }

                this.originalAA = this.originalAA + "" + singleBaseChangeCodonFunc.variantAA.getLetter();
                if (i < refCodonLength - 1)
                    this.originalAA = this.originalAA + ",";
            }
            if (this.MNPdependentAA)
                this.originalAA = this.originalAA + ")";
        }

        // Classifies a variant codon relative to the reference codon/AA:
        private static class CodonFunction {
            private String variantCodon;
            private AminoAcid variantAA;
            private boolean changesAA;
            private String functionalClass;

            public CodonFunction(String variantCodon, String refCodon, AminoAcid refAA) {
                this.variantCodon = variantCodon;
                this.variantAA = AminoAcidTable.getEukaryoticAA(this.variantCodon);
                this.changesAA = !refAA.equals(variantAA);

                if (!this.variantCodon.equals(refCodon)) {
                    if (changesAA) {
                        if (variantAA.isStop()) {
                            functionalClass = "nonsense";
                        }
                        else if (refAA.isStop()) {
                            functionalClass = "readthrough";
                        }
                        else {
                            functionalClass = "missense";
                        }
                    }
                    else { // the same aa:
                        functionalClass = "silent";
                    }
                }
                else { // the same codon:
                    functionalClass = "no_change";
                }
            }
        }

        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append(ALLELE_START);
            sb.append(REFSEQ_VARIANT_CODON).append(ASSIGNMENT).append(codonFunc.variantCodon).append(CODON_INFO_DELIM);
            sb.append(REFSEQ_VARIANT_AA).append(ASSIGNMENT).append(codonFunc.variantAA.getCode()).append(CODON_INFO_DELIM);
            sb.append(REFSEQ_CHANGES_AA).append(ASSIGNMENT).append(codonFunc.changesAA).append(CODON_INFO_DELIM);
            sb.append(REFSEQ_FUNCTIONAL_CLASS).append(ASSIGNMENT).append(codonFunc.functionalClass).append(CODON_INFO_DELIM);
            sb.append(REFSEQ_PROTEIN_COORD_DESCRIPTION).append(ASSIGNMENT).append(proteinCoordStr).append(CODON_INFO_DELIM);
            sb.append(MNP_DEPENDENT_AA).append(ASSIGNMENT).append(MNPdependentAA).append(originalAA);
            sb.append(ALLELE_END);
            return sb.toString();
        }
    }
}
// External classes:
/**
 * Position-indexed cache of RefSeq features: maps a single-base GenomeLoc to the
 * PositionalRefSeqFeatures recorded at that position, kept sorted by position.
 */
class LocusToFeatures {
    private Map<GenomeLoc, PositionalRefSeqFeatures> locusToFeatures;

    public LocusToFeatures() {
        locusToFeatures = new TreeMap<GenomeLoc, PositionalRefSeqFeatures>();
    }

    /** Returns the features cached at loc, or null if none have been recorded. */
    public PositionalRefSeqFeatures getLocusFeatures(GenomeLoc loc) {
        return locusToFeatures.get(loc);
    }

    /**
     * Records one RefSeq annotation row at loc, creating the per-position container
     * on first use; locusUsingThis marks a downstream locus that still needs it.
     */
    public void putLocusFeatures(GenomeLoc loc, AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) {
        PositionalRefSeqFeatures featuresAtLoc = locusToFeatures.get(loc);
        if (featuresAtLoc == null) {
            featuresAtLoc = new PositionalRefSeqFeatures(locusUsingThis);
            locusToFeatures.put(loc, featuresAtLoc);
        }
        featuresAtLoc.putFeature(refSeqAnnotation, locusUsingThis);
    }

    public Set<Map.Entry<GenomeLoc, PositionalRefSeqFeatures>> entrySet() {
        return locusToFeatures.entrySet();
    }

    public String toString() { // INTERNAL use only
        StringBuilder text = new StringBuilder();
        for (Map.Entry<GenomeLoc, PositionalRefSeqFeatures> entry : entrySet())
            text.append("Locus: ").append(entry.getKey()).append("\n").append(entry.getValue());
        return text.toString();
    }
}
/**
 * All RefSeq features at a single genomic position, keyed by RefSeq entry name.
 * Only coding-sequence ("CDS") rows are retained; tracks the furthest downstream
 * locus that still needs these features so the cache can be pruned.
 */
class PositionalRefSeqFeatures {
    private final static String[] REQUIRE_COLUMNS =
            {AnnotateMNPsWalker.REFSEQ_NAME, AnnotateMNPsWalker.REFSEQ_POSITION_TYPE};

    private Map<String, PositionalRefSeqFeature> nameToFeature;
    private GenomeLoc furthestLocusUsingFeatures;

    public PositionalRefSeqFeatures(GenomeLoc locusUsingThis) {
        nameToFeature = new HashMap<String, PositionalRefSeqFeature>();
        furthestLocusUsingFeatures = locusUsingThis;
    }

    /**
     * Adds one RefSeq annotation row, validating required columns and checking
     * consistency against any row previously seen for the same RefSeq name.
     */
    public void putFeature(AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) {
        for (String requiredColumn : REQUIRE_COLUMNS) {
            if (!refSeqAnnotation.containsColumnName(requiredColumn))
                throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + requiredColumn);
        }

        // Extend these features' lifetime if a more downstream locus needs them:
        if (locusUsingThis.isPast(furthestLocusUsingFeatures))
            furthestLocusUsingFeatures = locusUsingThis;

        // only interested in coding sequence annotations
        if (!refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_POSITION_TYPE).equals(AnnotateMNPsWalker.REFSEQ_CDS))
            return;

        PositionalRefSeqFeature parsedFeature = new PositionalRefSeqFeature(refSeqAnnotation);
        String refSeqName = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME);

        PositionalRefSeqFeature existingFeature = nameToFeature.get(refSeqName);
        if (existingFeature == null) {
            existingFeature = parsedFeature;
            nameToFeature.put(refSeqName, existingFeature);
        }
        else if (!existingFeature.equals(parsedFeature)) {
            throw new UserException("Inconsistency between previous RefSeq entry and: " + refSeqAnnotation);
        }
        existingFeature.updateFeature(refSeqAnnotation);
    }

    public GenomeLoc getFurthestLocusUsingFeatures() {
        return furthestLocusUsingFeatures;
    }

    public Set<Map.Entry<String, PositionalRefSeqFeature>> entrySet() {
        return nameToFeature.entrySet();
    }

    public String toString() { // INTERNAL use only
        StringBuilder text = new StringBuilder();
        for (Map.Entry<String, PositionalRefSeqFeature> entry : entrySet())
            text.append(entry.getKey()).append(" -> [").append(entry.getValue()).append("]\n");
        return text.toString();
    }
}
class PositionalRefSeqFeature {
private final static String[] REQUIRE_COLUMNS =
{AnnotateMNPsWalker.REFSEQ_NAME2, AnnotateMNPsWalker.REFSEQ_STRAND,
AnnotateMNPsWalker.REFSEQ_CODON_COORD, AnnotateMNPsWalker.REFSEQ_CODING_FRAME,
AnnotateMNPsWalker.REFSEQ_REF_CODON, AnnotateMNPsWalker.REFSEQ_REF_AA};
protected String name2;
protected boolean positiveStrand;
protected int codonCoord;
protected int codingFrame;
protected String referenceCodon;
protected String referenceAA;
private Map<String, BaseAnnotations> baseToAnnotations;
public PositionalRefSeqFeature(AnnotatorInputTableFeature refSeqAnnotation) {
for (String column : REQUIRE_COLUMNS) {
if (!refSeqAnnotation.containsColumnName(column))
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
}
this.name2 = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME2);
this.positiveStrand = (refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_STRAND).equals(AnnotateMNPsWalker.REFSEQ_POS_STRAND));
this.codonCoord = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODON_COORD));
this.codingFrame = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODING_FRAME));
this.referenceCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_CODON);
this.referenceAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_AA);
this.baseToAnnotations = new HashMap<String, BaseAnnotations>();
}
public boolean equals(PositionalRefSeqFeature that) {
return this.name2.equals(that.name2) && this.positiveStrand == that.positiveStrand && this.codonCoord == that.codonCoord && this.codingFrame == that.codingFrame
&& this.referenceCodon.equals(that.referenceCodon) && this.referenceAA.equals(that.referenceAA);
}
public void updateFeature(AnnotatorInputTableFeature refSeqAnnotation) {
if (!refSeqAnnotation.containsColumnName(AnnotateMNPsWalker.REFSEQ_ALT_BASE))
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + AnnotateMNPsWalker.REFSEQ_ALT_BASE);
String base = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_ALT_BASE);
baseToAnnotations.put(base, new BaseAnnotations(refSeqAnnotation));
}
/**
 * Debug rendering of all positional fields plus every per-base annotation.
 * INTERNAL use only — the format is not part of any public contract.
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder()
            .append("name2= ").append(name2)
            .append(", positiveStrand= ").append(positiveStrand)
            .append(", codonCoord= ").append(codonCoord)
            .append(", codingFrame= ").append(codingFrame)
            .append(", referenceCodon= ").append(referenceCodon)
            .append(", referenceAA= ").append(referenceAA)
            .append(", baseAnnotations= {");

    for (Map.Entry<String, BaseAnnotations> entry : baseToAnnotations.entrySet()) {
        text.append(" ").append(entry.getKey()).append(" -> {").append(entry.getValue()).append("}");
    }

    return text.append(" }").toString();
}
}
/**
 * The per-alternate-base portion of a RefSeq annotation row: the variant codon
 * and amino acid produced by substituting one alternate base, whether the
 * substitution changes the amino acid, its functional class, and the
 * protein-coordinate description. Instances are immutable after construction
 * and are stored in PositionalRefSeqFeature keyed by alternate base.
 */
class BaseAnnotations {
    // Columns that must be present in the input row before any value is read.
    private final static String[] REQUIRE_COLUMNS =
            {AnnotateMNPsWalker.REFSEQ_VARIANT_CODON, AnnotateMNPsWalker.REFSEQ_VARIANT_AA,
                    AnnotateMNPsWalker.REFSEQ_CHANGES_AA, AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS,
                    AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION};

    protected String variantCodon;      // codon after substituting the alternate base
    protected String variantAA;         // amino acid encoded by the variant codon
    protected boolean changesAA;        // true when the substitution is non-synonymous
    protected String functionalClass;   // e.g. the functional classification from the table
    protected String proteinCoordStr;   // protein-coordinate description string

    /**
     * Reads the per-base annotation values from one annotator input row,
     * first verifying that every required column is present.
     *
     * @param refSeqAnnotation one row of the RefSeq annotator input table
     * @throws UserException if any required column is missing from the row
     */
    public BaseAnnotations(AnnotatorInputTableFeature refSeqAnnotation) {
        for (String requiredColumn : REQUIRE_COLUMNS) {
            if (!refSeqAnnotation.containsColumnName(requiredColumn)) {
                throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + requiredColumn);
            }
        }

        this.variantCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_CODON);
        this.variantAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_AA);
        this.changesAA = Boolean.parseBoolean(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CHANGES_AA));
        this.functionalClass = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS);
        this.proteinCoordStr = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION);
    }

    /**
     * Debug rendering of all per-base fields. INTERNAL use only — the format
     * is not part of any public contract.
     */
    @Override
    public String toString() {
        return "variantCodon= " + variantCodon
                + ", variantAA= " + variantAA
                + ", changesAA= " + changesAA
                + ", functionalClass= " + functionalClass
                + ", proteinCoordStr= " + proteinCoordStr;
    }
}

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator;
package org.broadinstitute.sting.utils;
/**
* Represents a single amino acid.

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator;
package org.broadinstitute.sting.utils;
import java.util.HashMap;

View File

@ -1,83 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator;
import java.util.Arrays;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
public class GenomicAnnotatorIntegrationTest extends WalkerTest {
String testFileWithIndels = validationDataLocation + "/GenomicAnnotatorValidation/1KGBroadWEx.cleaned.indels.vcf";
String testFileWithSNPsAndIndels = validationDataLocation + "/GenomicAnnotatorValidation/1KGBroadWEx.variants.vcf";
@Test
public void testGenomicAnnotatorOnDbSNP() {
/*
TODO put this test back in once it gets faster.
String[] md5 = {"d19d6d1eb52fb09e7493653dc645d92a"};
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenomicAnnotator -R " + b36KGReference + " " +
"-B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf " +
"-B:dbsnp,AnnotatorInputTable /humgen/gsa-hpprojects/GATK/data/Annotations/dbsnp/b130/snp130-b36-only-the-SNPs.txt " +
"-m " + //generate many records from one input record if necessary
"-o %s " +
"-BTI variant",
1,
Arrays.asList(md5));
executeTest("test with dbSNP", spec);
*/
String[] md5WithDashSArg = {"efba4ce1641cfa2ef88a64395f2ebce8"};
WalkerTestSpec specWithSArg = new WalkerTestSpec(
"-T GenomicAnnotator -R " + b36KGReference +
" -B:variant,vcf3 /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" +
" -B:dbsnp,AnnotatorInputTable /humgen/gsa-hpprojects/GATK/data/Annotations/dbsnp/b130/snp130-b36-only-the-SNPs.txt" +
" -m" + //generate many records from one input record if necessary
" -o %s" +
" -BTI variant" +
" -s dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet" +
" -NO_HEADER",
1,
Arrays.asList(md5WithDashSArg));
executeTest("test with dbSNP and -s arg", specWithSArg);
}
@Test
public void testGenomicAnnotatorOnIndels() {
WalkerTestSpec testOnIndels = new WalkerTestSpec(
buildCommandLine(
"-T GenomicAnnotator",
"-R " + b37KGReference,
"-L 22:10000000-20000000",
"-B:refseq,AnnotatorInputTable " + b37Refseq,
"-B:variant,VCF " + testFileWithIndels,
"-NO_HEADER",
"-o %s"
),
1,
Arrays.asList("772fc3f43b70770ec6c6acbb8bbbd4c0")
);
executeTest("testGenomicAnnotatorOnIndels", testOnIndels);
}
@Test
public void testGenomicAnnotatorOnSNPsAndIndels() {
WalkerTestSpec testOnSNPsAndIndels = new WalkerTestSpec(
buildCommandLine(
"-T GenomicAnnotator",
"-R " + b37KGReference,
"-L 22:10000000-20000000",
"-B:refseq,AnnotatorInputTable " + b37Refseq,
"-B:variant,VCF " + testFileWithSNPsAndIndels,
"-NO_HEADER",
"-o %s"
),
1,
Arrays.asList("081ade7f3d2d3c5f19cb1e8651a626f3")
);
executeTest("testGenomicAnnotatorOnSNPsAndIndels", testOnSNPsAndIndels);
}
}

View File

@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("c97829259463d04b0159591bb6fb44af"));
Arrays.asList("16b0c7b47745abcd1ddaa2e261719530"));
executeTest("test MultiSample Pilot1", spec);
}
@ -54,12 +54,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("2b69667f4770e8c0c894066b7f27e440"));
Arrays.asList("811ddc0bd8322b14f14f58df8c627aa9"));
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("b77fe007c2a97fcd59dfd5eef94d8b95"));
Arrays.asList("5cf08dd7ac3d218082f7be3915ce0b15"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -67,7 +67,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("ee8a5e63ddd470726a749e69c0c20f60"));
Arrays.asList("75156264696563c2f47620fef9424f7c"));
executeTest("test SingleSample Pilot2", spec);
}
@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "ef31654a2b85b9b2d3bba4f4a75a17b6";
private final static String COMPRESSED_OUTPUT_MD5 = "7255e03430549cb97d8fcae34cbffb02";
@Test
public void testCompressedOutput() {
@ -107,7 +107,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "46868a9c4134651c54535fb46b408aee";
String md5 = "7912109e83fda21dae90ef8d5dd0140d";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -138,9 +138,10 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testCallingParameters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "--min_base_quality_score 26", "5043c9a101e691602eb7a3f9704bdf20" );
e.put( "--min_mapping_quality_score 26", "71a833eb8fd93ee62ae0d5a430f27940" );
e.put( "--p_nonref_model GRID_SEARCH", "ddf443e9dcadef367476b26b4d52c134" );
e.put( "--min_base_quality_score 26", "6d3aa9f783ca63f37c952f83eeda593c" );
e.put( "--min_mapping_quality_score 26", "51bfdf777123bf49de5d92ffde5c74e7" );
e.put( "--p_nonref_model GRID_SEARCH", "333328ab2c8da2875fade599e80a271f" );
e.put( "--computeSLOD", "226caa28a4fa9fe34f3beb8a23f3d53d" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -153,9 +154,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-sites_only", "eaad6ceb71ab94290650a70bea5ab951" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "05bf7db8a3d19ef4a3d14772c90b732f" );
e.put( "--output_mode EMIT_ALL_SITES", "e4b86740468d7369f0156550855586c7" );
e.put( "-sites_only", "5f659dee408710d3709ed72005cd863a" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "55d09bf13149bddc06cc36be0801507b" );
e.put( "--output_mode EMIT_ALL_SITES", "727f49dcb2439b18446829efc3b1561c" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -169,12 +170,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("71a833eb8fd93ee62ae0d5a430f27940"));
Arrays.asList("51bfdf777123bf49de5d92ffde5c74e7"));
executeTest("test confidence 1", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
Arrays.asList("79968844dc3ddecb97748c1acf2984c7"));
Arrays.asList("c67c285e70fd4457c9f9ce7bd878ddca"));
executeTest("test confidence 2", spec2);
}
@ -186,8 +187,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 0.01, "4e878664f61d2d800146d3762303fde1" );
e.put( 1.0 / 1850, "9204caec095ff5e63ca21a10b6fab453" );
e.put( 0.01, "7ecc564d4db97d5932cef2e558550ed2" );
e.put( 1.0 / 1850, "aa9e101bb9f9e111fe292fec467d915a" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -211,7 +212,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("1a58ec52df545f946f80cc16c5736a91"));
Arrays.asList("2efd686186b2c5129be4cf89274a24dd"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -230,25 +231,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("62d0f6d9de344ce68ce121c13b1e78b1"));
Arrays.asList("2892d35331fe9fc141ba19269ec7caed"));
executeTest(String.format("test calling with BAQ"), spec);
}
@Test
public void testCallingWithBAQOff() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand +
" -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
" -o %s" +
" -L 1:10,000,000-10,100,000" +
" -baq OFF",
1,
Arrays.asList("1a58ec52df545f946f80cc16c5736a91"));
executeTest(String.format("test calling with BAQ OFF"), spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing indel caller
@ -263,7 +250,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("631ae1f1eb6bc4c1a4136b8495250536"));
Arrays.asList("8c2afb4289ed44521933d1a74c8d6c7f"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -278,7 +265,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("fd556585c79e2b892a5976668f45aa43"));
Arrays.asList("b6fb70590a10e1c27fb611732916f27d"));
executeTest(String.format("test indel caller in SLX witn low min allele count"), spec);
}
@ -291,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("9cd56feedd2787919e571383889fde70"));
Arrays.asList("61642502bd08cc03cdaaeb83a5426b46"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -301,14 +288,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("315e1b78d7a403d7fcbcf0caa8c496b8"));
Arrays.asList("69b0b3f089c80b9864294d838a061336"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("cf89e0c54f14482a23c105b73a333d8a"));
Arrays.asList("c90174cfd7dd68bdef36fe2c60145e10"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
}

View File

@ -32,13 +32,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
1,
Arrays.asList(base_md5_with_SW_or_VCF));
executeTest("test realigner defaults with VCF", spec2);
WalkerTestSpec spec3 = new WalkerTestSpec(
baseCommand + "-D " + GATKDataLocation + "dbsnp_129_b36.rod",
1,
Arrays.asList(base_md5));
executeTest("realigner defaults with dbsnp", spec3);
}
@Test
@ -48,12 +41,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
1,
Arrays.asList("3dd5d2c9931b375455af0bff1a2c4888"));
executeTest("realigner known indels only from VCF", spec1);
WalkerTestSpec spec2 = new WalkerTestSpec(
baseCommand + "--consensusDeterminationModel KNOWNS_ONLY -D " + GATKDataLocation + "dbsnp_129_b36.rod",
1,
Arrays.asList("05a114623c126b0398fbc1703437461e"));
executeTest("realigner known indels only from dbsnp", spec2);
}
@Test
@ -63,12 +50,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
1,
Arrays.asList(base_md5_with_SW_or_VCF));
executeTest("realigner use SW from VCF", spec1);
WalkerTestSpec spec2 = new WalkerTestSpec(
baseCommand + "--consensusDeterminationModel USE_SW -D " + GATKDataLocation + "dbsnp_129_b36.rod",
1,
Arrays.asList(base_md5_with_SW_or_VCF));
executeTest("realigner use SW from dbsnp", spec2);
}
@Test

View File

@ -30,7 +30,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest {
" -LOD 5" +
" -maxConsensuses 100" +
" -greedy 100" +
" -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" +
" -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" +
" -o /dev/null" +
" -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" +
" -L chr1:1-5,650,000" +
@ -45,7 +45,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest {
" -LOD 5" +
" -maxConsensuses 100" +
" -greedy 100" +
" -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" +
" -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" +
" -o /dev/null" +
" -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" +
" -L chr1:1-150,000,000" +

View File

@ -11,15 +11,15 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest {
public void testIntervals() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
"-T RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
"-T RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam --mismatchFraction 0.15 -L 1:10,000,000-10,050,000 -o %s",
1,
Arrays.asList("e7accfa58415d6da80383953b1a3a986"));
executeTest("test standard", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
"-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
"-T RealignerTargetCreator -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.b36.excluding_sites_after_129.vcf -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
1,
Arrays.asList("f23ba17ee0f9573dd307708175d90cd2"));
Arrays.asList("0367d39a122c8ac0899fb868a82ef728"));
executeTest("test dbsnp", spec2);
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(

View File

@ -12,7 +12,7 @@ public class RealignerTargetCreatorPerformanceTest extends WalkerTest {
WalkerTestSpec spec1 = new WalkerTestSpec(
"-R " + hg18Reference +
" -T RealignerTargetCreator" +
" -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" +
" -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" +
" -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" +
" -L chr1:1-50,000,000" +
" -o /dev/null",
@ -23,7 +23,7 @@ public class RealignerTargetCreatorPerformanceTest extends WalkerTest {
WalkerTestSpec spec2 = new WalkerTestSpec(
"-R " + hg18Reference +
" -T RealignerTargetCreator" +
" -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" +
" -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" +
" -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" +
" -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" +
" -o /dev/null",