Expanded, contrastive VariantContextBenchmark

-- Compares performance across a bunch of common operations with GATK 1.3 version of VariantContext and GATK 1.4
-- 1.3 VC and associated utilities copied wholesale into test directory under v13
This commit is contained in:
Mark DePristo 2011-11-16 13:35:16 -05:00
parent e56d52006a
commit 101ffc4dfd
31 changed files with 7727 additions and 34 deletions

View File

@ -27,8 +27,14 @@ package org.broadinstitute.sting.utils.variantcontext;
import com.google.caliper.Param;
import com.google.caliper.SimpleBenchmark;
import com.google.caliper.runner.CaliperMain;
import net.sf.picard.reference.ReferenceSequenceFile;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.*;
import java.util.*;
@ -46,24 +52,39 @@ public class VariantContextBenchmark extends SimpleBenchmark {
@Param({"100"})
int nSamplesToTake; // set automatically by framework
@Param({"READ", "READ_SUBSET"})
Operation operation; // set automatically by framework
@Param({"10"})
int dupsToMerge; // set automatically by framework
@Param({"OF_SAMPLES"})
SubContextOp subContextOp; // set automatically by framework
@Param
Operation operation; // set automatically by framework
private String INPUT_STRING;
// NOTE(review): as rendered here this enum is diff residue -- "READ, READ_SUBSET"
// are the pre-refactor constants and the list below is their replacement; a
// separator is missing between them, so this block as shown does not compile.
// Each constant names one VariantContext operation the benchmark can time.
public enum Operation {
    READ,
    READ_SUBSET
    SUBSET_TO_SAMPLES,
    GET_TYPE,
    GET_ID,
    GET_GENOTYPES,
    GET_ATTRIBUTE_STRING,
    GET_ATTRIBUTE_INT,
    GET_N_SAMPLES,
    GET_GENOTYPES_FOR_SAMPLES,
    GET_GENOTYPES_IN_ORDER_OF_NAME,
    CALC_GENOTYPE_COUNTS,
    MERGE
}
// Which sub-context operation READ_SUBSET exercises; only one flavor exists today.
public enum SubContextOp {
    OF_SAMPLES
}
private GenomeLocParser b37GenomeLocParser;
@Override protected void setUp() {
try {
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.b37KGReference));
b37GenomeLocParser = new GenomeLocParser(seq);
} catch ( FileNotFoundException e) {
throw new RuntimeException(e);
}
// read it into a String so that we don't try to benchmark IO issues
try {
FileInputStream s = new FileInputStream(new File(vcfFile));
@ -83,52 +104,266 @@ public class VariantContextBenchmark extends SimpleBenchmark {
}
}
private void parseGenotypes(VCFCodec codec, Operation op, SubContextOp subop ) {
/**
 * The unit of work timed by runBenchmark: invoked once per decoded record.
 *
 * @param <T> the Feature type produced by the codec under test
 */
private interface FunctionToBenchmark<T extends Feature> {
    public void run(T vc);
}
/**
 * Decodes up to linesToRead records from the in-memory VCF text and applies
 * func to each -- the VCF was pre-read into INPUT_STRING so I/O is excluded
 * from the timing.  Any exception aborts the run with a console message.
 */
private <T extends Feature> void runBenchmark(FeatureCodec<T> codec, FunctionToBenchmark<T> func) {
    try {
        InputStream is = new ByteArrayInputStream(INPUT_STRING.getBytes());
        AsciiLineReader lineReader = new AsciiLineReader(is);
        codec.readHeader(lineReader);
        int counter = 0;
        Set<String> samples = null;
        while (counter++ < linesToRead ) {
            String line = lineReader.readLine();
            if ( line == null )
                break;
            // NOTE(review): the next six lines are stale pre-refactor code left
            // interleaved by the diff rendering (duplicate declaration of 'vc',
            // references to removed fields 'op'/'subop'); as shown this method
            // does not compile -- the generic decode/run pair below is the
            // intended body.
            VariantContext vc = (VariantContext)codec.decode(line);
            if ( samples == null ) {
                samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
            }
            if ( op == Operation.READ_SUBSET)
                processOneVC(vc, samples, subop);
            T vc = codec.decode(line);
            func.run(vc);
        }
    } catch (Exception e) {
        // best-effort benchmark: report and swallow so Caliper keeps running
        System.out.println("Benchmarking run failure because of " + e.getMessage());
    }
}
public void timeMe(int rep) {
for ( int i = 0; i < rep; i++ )
parseGenotypes(new VCFCodec(), operation, subContextOp);
/**
 * Caliper entry point: times the configured operation against the current
 * (1.4) VariantContext implementation, rep times.
 */
public void timeV14(int rep) {
    int iteration = 0;
    while ( iteration < rep ) {
        final FeatureCodec<VariantContext> vcfCodec = new VCFCodec();
        runBenchmark(vcfCodec, getV14FunctionToBenchmark());
        iteration++;
    }
}
/**
 * Builds the closure that exercises the configured {@link Operation} against
 * the current (1.4) VariantContext API; one anonymous class per operation.
 *
 * @return a FunctionToBenchmark applied to each decoded record
 * @throws IllegalArgumentException if the operation has no case below
 */
public FunctionToBenchmark<VariantContext> getV14FunctionToBenchmark() {
    switch ( operation ) {
        case READ:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    ; // empty operation -- measures decode cost alone
                }
            };
        case SUBSET_TO_SAMPLES:
            return new FunctionToBenchmark<VariantContext>() {
                // lazily initialized from the first record; reused for the rest of the run
                Set<String> samples;
                public void run(final VariantContext vc) {
                    if ( samples == null )
                        samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
                    VariantContext sub = vc.subContextFromSamples(samples);
                    sub.getNSamples(); // touch the result so the subset isn't dead code
                }
            };
        case GET_TYPE:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    vc.getType();
                }
            };
        case GET_ID:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    vc.getID();
                }
            };
        case GET_GENOTYPES:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    vc.getGenotypes().size();
                }
            };
        case GET_GENOTYPES_FOR_SAMPLES:
            return new FunctionToBenchmark<VariantContext>() {
                Set<String> samples;
                public void run(final VariantContext vc) {
                    if ( samples == null )
                        samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
                    vc.getGenotypes(samples).size();
                }
            };
        case GET_ATTRIBUTE_STRING:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    vc.getAttribute("AN", null);
                }
            };
        case GET_ATTRIBUTE_INT:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    vc.getAttributeAsInt("AC", 0);
                }
            };
        case GET_N_SAMPLES:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    vc.getNSamples();
                }
            };
        case GET_GENOTYPES_IN_ORDER_OF_NAME:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    ; // TODO - TEST IS BROKEN -- intentionally a no-op until fixed
                    // int n = 0;
                    // for ( final Genotype g: vc.getGenotypesSortedByName() ) n++;
                }
            };
        case CALC_GENOTYPE_COUNTS:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    vc.getHetCount();
                }
            };
        case MERGE:
            return new FunctionToBenchmark<VariantContext>() {
                public void run(final VariantContext vc) {
                    // duplicate the record dupsToMerge times with suffixed sample
                    // names, then time the merge of all duplicates
                    List<VariantContext> toMerge = new ArrayList<VariantContext>();
                    for ( int i = 0; i < dupsToMerge; i++ ) {
                        GenotypeCollection gc = GenotypeCollection.create(vc.getNSamples());
                        for ( final Genotype g : vc.getGenotypes() ) {
                            gc.add(new Genotype(g.getSampleName()+"_"+i, g));
                        }
                        toMerge.add(VariantContext.modifyGenotypes(vc, gc));
                    }
                    VariantContextUtils.simpleMerge(b37GenomeLocParser, toMerge, null,
                            VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
                            VariantContextUtils.GenotypeMergeType.UNSORTED,
                            true, false, "set", false, true);
                }
            };
        default: throw new IllegalArgumentException("Unexpected operation " + operation);
    }
}
/**
 * Caliper entry point: times the configured operation against the legacy
 * (1.3) VariantContext implementation, rep times.
 */
public void timeV13(int rep) {
    int iteration = 0;
    while ( iteration < rep ) {
        final FeatureCodec<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext> legacyCodec =
                new org.broadinstitute.sting.utils.variantcontext.v13.VCFCodec();
        runBenchmark(legacyCodec, getV13FunctionToBenchmark());
        iteration++;
    }
}
/**
 * Builds the closure that exercises the configured {@link Operation} against
 * the legacy (1.3) VariantContext API copied under the v13 package; mirrors
 * getV14FunctionToBenchmark case-for-case so the two are comparable.
 *
 * @return a FunctionToBenchmark applied to each decoded legacy record
 * @throws IllegalArgumentException if the operation has no case below
 */
public FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext> getV13FunctionToBenchmark() {
    switch ( operation ) {
        case READ:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    ; // empty operation -- measures decode cost alone
                }
            };
        case SUBSET_TO_SAMPLES:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                // lazily initialized from the first record; reused for the rest of the run
                List<String> samples;
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    if ( samples == null )
                        samples = new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake);
                    org.broadinstitute.sting.utils.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values());
                    sub.getNSamples(); // touch the result so the subset isn't dead code
                }
            };
        case GET_TYPE:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    vc.getType();
                }
            };
        case GET_ID:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    vc.getID();
                }
            };
        case GET_GENOTYPES:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    vc.getGenotypes().size();
                }
            };
        case GET_GENOTYPES_FOR_SAMPLES:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                Set<String> samples;
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    if ( samples == null )
                        samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
                    vc.getGenotypes(samples).size();
                }
            };
        case GET_ATTRIBUTE_STRING:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    vc.getAttribute("AN", null);
                }
            };
        case GET_ATTRIBUTE_INT:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    vc.getAttributeAsInt("AC", 0);
                }
            };
        case GET_N_SAMPLES:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    vc.getNSamples();
                }
            };
        case GET_GENOTYPES_IN_ORDER_OF_NAME:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    ; // TODO - TEST IS BROKEN -- intentionally a no-op until fixed
                    //vc.getGenotypesSortedByName();
                }
            };
        case CALC_GENOTYPE_COUNTS:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    vc.getHetCount();
                }
            };
        case MERGE:
            return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
                public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
                    // duplicate the record dupsToMerge times with suffixed sample
                    // names, then time the merge of all duplicates
                    List<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext> toMerge = new ArrayList<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>();
                    for ( int i = 0; i < dupsToMerge; i++ ) {
                        Map<String, org.broadinstitute.sting.utils.variantcontext.v13.Genotype> gc = new HashMap<String, org.broadinstitute.sting.utils.variantcontext.v13.Genotype>();
                        for ( final org.broadinstitute.sting.utils.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) {
                            String name = g.getSampleName()+"_"+i;
                            gc.put(name, new org.broadinstitute.sting.utils.variantcontext.v13.Genotype(name,
                                    g.getAlleles(), g.getNegLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector()));
                        }
                        // BUGFIX: this add() was previously inside the genotype loop,
                        // queueing one partially-populated duplicate per sample instead
                        // of one complete duplicate per iteration -- which both skewed
                        // the benchmark vs. the v1.4 path and merged bogus records.
                        toMerge.add(org.broadinstitute.sting.utils.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc));
                    }
                    org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser,
                            toMerge, null,
                            org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
                            org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED,
                            true, false, "set", false, true);
                }
            };
        default: throw new IllegalArgumentException("Unexpected operation " + operation);
    }
}
// Command-line entry point: hands off to Caliper, which instantiates this
// benchmark and sweeps the @Param fields across their declared values.
public static void main(String[] args) {
    CaliperMain.main(VariantContextBenchmark.class, args);
}
/**
 * Applies one sub-context operation to a record, touching the result so the
 * work cannot be optimized away.
 *
 * @param vc      the record to subset
 * @param samples sample names to keep in the sub-context
 * @param subop   which subsetting flavor to apply
 */
private static final void processOneVC(VariantContext vc, Set<String> samples, SubContextOp subop) {
    final VariantContext subset;
    if ( subop == SubContextOp.OF_SAMPLES ) {
        subset = vc.subContextFromSamples(samples, vc.getAlleles());
    } else {
        throw new RuntimeException("Unexpected op: " + subop);
    }
    subset.getNSamples();
}
}

View File

@ -0,0 +1,635 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.NameAwareCodec;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.util.BlockCompressedInputStream;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser {
protected final static Logger log = Logger.getLogger(VCFCodec.class);
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column
protected VCFHeaderVersion version;
// we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null;
// a mapping of the allele
protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
// for ParsingUtils.split
protected String[] GTValueArray = new String[100];
protected String[] genotypeKeyArray = new String[100];
protected String[] infoFieldArray = new String[1000];
protected String[] infoValueArray = new String[1000];
// for performance testing purposes
public static boolean validate = true;
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
// todo: make this thread safe?
protected String[] parts = null;
protected String[] genotypeParts = null;
// for performance we cache the hashmap of filter encodings for quick lookup
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
// a mapping of the VCF fields to their type, filter fields, and format fields, for quick lookup to validate against
TreeMap<String, VCFHeaderLineType> infoFields = new TreeMap<String, VCFHeaderLineType>();
TreeMap<String, VCFHeaderLineType> formatFields = new TreeMap<String, VCFHeaderLineType>();
Set<String> filterFields = new HashSet<String>();
// we store a name to give to each of the variant contexts we emit
protected String name = "Unknown";
protected int lineNo = 0;
protected Map<String, String> stringCache = new HashMap<String, String>();
/**
* @param reader the line reader to take header lines from
* @return the number of header lines
*/
public abstract Object readHeader(LineReader reader);
/**
* create a genotype map
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public abstract Map<String, Genotype> createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
/**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
* @param filterString the string to parse
* @return a set of the filters applied
*/
protected abstract Set<String> parseFilters(String filterString);
/**
 * create a VCF header from the ## meta-data lines plus the single # column-name line
 * @param headerStrings a list of strings that represent all the ## entries
 * @param line the single # line (column names)
 * @return the constructed VCFHeader (also cached in this.header)
 */
protected Object createHeader(List<String> headerStrings, String line) {
    headerStrings.add(line);
    Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
    Set<String> auxTags = new LinkedHashSet<String>();  // the columns after FORMAT, kept in file order
    // iterate over all the passed in strings
    for ( String str : headerStrings ) {
        if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) {
            // the single # column-name line: validate the fixed columns in order,
            // then an optional FORMAT column, then collect the trailing sample columns
            String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR);
            if ( strings.length < VCFHeader.HEADER_FIELDS.values().length )
                throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str);
            int arrayIndex = 0;
            for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
                try {
                    if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex]))
                        throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'");
                } catch (IllegalArgumentException e) {
                    // valueOf throws when the text isn't any legal column name at all
                    throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name.");
                }
                arrayIndex++;
            }
            boolean sawFormatTag = false;
            if ( arrayIndex < strings.length ) {
                if ( !strings[arrayIndex].equals("FORMAT") )
                    throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'");
                sawFormatTag = true;
                arrayIndex++;
            }
            // everything after FORMAT is treated as a sample column
            while ( arrayIndex < strings.length )
                auxTags.add(strings[arrayIndex++]);
            if ( sawFormatTag && auxTags.size() == 0 )
                throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
        } else {
            // a ## meta-data line: dispatch on its tag; INFO/FILTER/FORMAT
            // declarations are also cached in the typed maps for per-record validation
            if ( str.startsWith("##INFO=") ) {
                VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
                metaData.add(info);
                infoFields.put(info.getName(), info.getType());
            } else if ( str.startsWith("##FILTER=") ) {
                VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9),version);
                metaData.add(filter);
                filterFields.add(filter.getName());
            } else if ( str.startsWith("##FORMAT=") ) {
                VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9),version);
                metaData.add(format);
                formatFields.put(format.getName(), format.getType());
            } else {
                // generic ##key=value line; lines without '=' are silently dropped
                int equals = str.indexOf("=");
                if ( equals != -1 )
                    metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1)));
            }
        }
    }
    header = new VCFHeader(metaData, auxTags);
    return header;
}
/**
 * the fast decode function: parses only as far as needed to produce a
 * location (chrom/start/stop), skipping genotype and INFO parsing entirely
 * @param line the line of text for the record
 * @return a feature, (not guaranteed complete) that has the correct start and stop
 */
public Feature decodeLoc(String line) {
    lineNo++;
    // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
    if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
    // our header cannot be null, we need the genotype sample names and counts
    if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
    // split only the first 6 columns -- CHROM..FILTER -- which is all a location needs
    final String[] locParts = new String[6];
    int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
    if ( nParts != 6 )
        throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
    // get our alleles (because the end position depends on them)
    final String ref = getCachedString(locParts[3].toUpperCase());
    final String alts = getCachedString(locParts[4].toUpperCase());
    final List<Allele> alleles = parseAlleles(ref, alts, lineNo);
    // find out our location
    final int start = Integer.valueOf(locParts[1]);
    int stop = start;
    // ref alleles don't need to be single bases for monomorphic sites
    if ( alleles.size() == 1 ) {
        stop = start + alleles.get(0).length() - 1;
    } else if ( !isSingleNucleotideEvent(alleles) ) {
        // indels/MNPs: the stop depends on the clipped reference span
        stop = clipAlleles(start, ref, alleles, null, lineNo);
    }
    return new VCFLocFeature(locParts[0], start, stop);
}
// Minimal location-only Feature returned by decodeLoc: just a contig name
// plus start/stop coordinates, with no allele or genotype payload.
private final static class VCFLocFeature implements Feature {
    final String chr;
    final int start, stop;
    private VCFLocFeature(String chr, int start, int stop) {
        this.chr = chr;
        this.start = start;
        this.stop = stop;
    }
    public String getChr() { return chr; }
    public int getStart() { return start; }
    public int getEnd() { return stop; }
}
/**
 * decode the line into a feature (VariantContext)
 * @param line the line
 * @return a VariantContext, or null if the line is a header line
 */
public Feature decode(String line) {
    // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
    if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
    // our header cannot be null, we need the genotype sample names and counts
    if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
    // lazily size the reusable split buffer: at most the 8 fixed columns plus one
    // FORMAT+genotypes blob (the trailing columns are split later, on demand)
    if (parts == null)
        parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
    int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
    // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data)
    // NOTE(review): the header == null tests below are dead -- header was
    // null-checked above -- so only the hasGenotypingData branch matters here
    if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
        (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
        throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
                " tokens, and saw " + nParts + " )", lineNo);
    return parseVCFLine(parts);
}
// Raises a malformed-VCF error tagged with this codec's current line number.
protected void generateException(String message) {
    throw new UserException.MalformedVCF(message, lineNo);
}
// Static variant for use from static parsing helpers, which must be handed
// the line number explicitly.
protected static void generateException(String message, int lineNo) {
    throw new UserException.MalformedVCF(message, lineNo);
}
/**
 * parse out the VCF line
 *
 * @param parts the parts split up (the 8 fixed columns, plus the unsplit genotype blob if present)
 * @return a variant context object
 */
private VariantContext parseVCFLine(String[] parts) {
    // increment the line count
    lineNo++;
    // parse out the required fields
    String contig = getCachedString(parts[0]);
    int pos = Integer.valueOf(parts[1]);
    String id = null;
    if ( parts[2].length() == 0 )
        generateException("The VCF specification requires a valid ID field");
    else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) )
        id = VCFConstants.EMPTY_ID_FIELD;
    else
        id = new String(parts[2]); // copy: detaches the id from the (possibly retained) split buffer
    String ref = getCachedString(parts[3].toUpperCase());
    String alts = getCachedString(parts[4].toUpperCase());
    Double qual = parseQual(parts[5]);
    String filter = getCachedString(parts[6]);
    String info = new String(parts[7]);
    // get our alleles, filters, and setup an attribute map
    List<Allele> alleles = parseAlleles(ref, alts, lineNo);
    Set<String> filters = parseFilters(filter);
    Map<String, Object> attributes = parseInfo(info, id);
    // find out our current location, and clip the alleles down to their minimum length
    int loc = pos;
    // ref alleles don't need to be single bases for monomorphic sites
    if ( alleles.size() == 1 ) {
        loc = pos + alleles.get(0).length() - 1;
    } else if ( !isSingleNucleotideEvent(alleles) ) {
        ArrayList<Allele> newAlleles = new ArrayList<Allele>();
        loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
        alleles = newAlleles;
    }
    // do we have genotyping data: stash the raw genotype string and this parser
    // so the genotype map can be built lazily, only when callers ask for it
    if (parts.length > NUM_STANDARD_FIELDS) {
        attributes.put(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, new String(parts[8]));
        attributes.put(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY, this);
    }
    VariantContext vc = null;
    try {
        vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]);
    } catch (Exception e) {
        // re-raise as a malformed-VCF error carrying the line number
        generateException(e.getMessage());
    }
    // did we resort the sample names? If so, we need to load the genotype data
    if ( !header.samplesWereAlreadySorted() )
        vc.getGenotypes();
    return vc;
}
/**
 * @return the type of record this codec produces (VariantContext)
 */
public Class<VariantContext> getFeatureType() {
    return VariantContext.class;
}
/**
 * get the name of this codec (stamped onto every emitted VariantContext)
 * @return our set name
 */
public String getName() {
    return name;
}
/**
 * set the name of this codec (stamped onto every emitted VariantContext)
 * @param name new name
 */
public void setName(String name) {
    this.name = name;
}
/**
 * Return a cached copy of the supplied string, so that repeated values
 * (contigs, REF/ALT strings, filters) share a single String instance.
 *
 * @param str string
 * @return interned string
 */
protected String getCachedString(String str) {
    String internedString = stringCache.get(str);
    if ( internedString == null ) {
        // deliberate copy: presumably to detach the key from any larger backing
        // array (pre-Java-7u6 substrings shared the parent's char[]) -- TODO confirm
        internedString = new String(str);
        stringCache.put(internedString, internedString);
    }
    return internedString;
}
/**
 * parse out the info fields
 * @param infoField the fields
 * @param id the identifier
 * @return a mapping of keys to objects; flag keys map to Boolean.TRUE,
 *         multi-valued keys to a List of Strings, and the ID (when present)
 *         is tucked in under VariantContext.ID_KEY
 */
private Map<String, Object> parseInfo(String infoField, String id) {
    Map<String, Object> attributes = new HashMap<String, Object>();
    if ( infoField.length() == 0 )
        generateException("The VCF specification requires a valid info field");
    if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) {
        if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 )
            generateException("The VCF specification does not allow for whitespace in the INFO field");
        // split on ';' into the shared reusable buffer
        int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false);
        for (int i = 0; i < infoFieldSplitSize; i++) {
            String key;
            Object value;
            int eqI = infoFieldArray[i].indexOf("=");
            if ( eqI != -1 ) {
                key = infoFieldArray[i].substring(0, eqI);
                String str = infoFieldArray[i].substring(eqI+1);
                // split on the INFO field separator
                int infoValueSplitSize = ParsingUtils.split(str, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false);
                if ( infoValueSplitSize == 1 ) {
                    value = infoValueArray[0];
                } else {
                    // comma-separated value: expose it as a List<String>
                    ArrayList<String> valueList = new ArrayList<String>(infoValueSplitSize);
                    for ( int j = 0; j < infoValueSplitSize; j++ )
                        valueList.add(infoValueArray[j]);
                    value = valueList;
                }
            } else {
                // bare key with no '=' is a VCF flag
                key = infoFieldArray[i];
                value = true;
            }
            attributes.put(key, value);
        }
    }
    if ( ! id.equals(VCFConstants.EMPTY_ID_FIELD) )
        attributes.put(VariantContext.ID_KEY, id);
    return attributes;
}
/**
 * Look up a single allele by its GT-field index within the declared alleles.
 *
 * @param index   the textual allele index from the GT field ("." for no-call)
 * @param alleles the alleles declared in the REF/ALT columns (REF first)
 * @return the corresponding Allele, or Allele.NO_CALL for the empty-allele marker
 * @throws TribbleException.InternalCodecException if the index is outside the declared alleles
 */
protected static Allele oneAllele(String index, List<Allele> alleles) {
    if ( index.equals(VCFConstants.EMPTY_ALLELE) )
        return Allele.NO_CALL;
    // parseInt avoids the needless boxing of Integer.valueOf
    final int i = Integer.parseInt(index);
    // guard both ends of the range: previously a negative index escaped as a raw
    // IndexOutOfBoundsException instead of a codec error
    if ( i < 0 || i >= alleles.size() )
        throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record");
    return alleles.get(i);
}
/**
 * parse genotype alleles from the genotype string
 * @param GT GT string
 * @param alleles list of possible alleles
 * @param cache cache of alleles for GT
 * @return the allele list for the GT string
 */
protected static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
    // cache results [since they are immutable] and return a single object for each genotype
    List<Allele> GTAlleles = cache.get(GT);
    if ( GTAlleles == null ) {
        // split on the phasing separators ('/' and '|'); countTokens presizes the list
        StringTokenizer st = new StringTokenizer(GT, VCFConstants.PHASING_TOKENS);
        GTAlleles = new ArrayList<Allele>(st.countTokens());
        while ( st.hasMoreTokens() ) {
            String genotype = st.nextToken();
            GTAlleles.add(oneAllele(genotype, alleles));
        }
        cache.put(GT, GTAlleles);
    }
    return GTAlleles;
}
/**
 * parse out the qual value
 * @param qualString the quality string
 * @return the QUAL column rescaled from phred to a neg-log10 error probability,
 *         or VariantContext.NO_NEG_LOG_10PERROR for a missing value
 */
protected static Double parseQual(String qualString) {
    // if we're the VCF 4 missing char, return immediately
    if ( qualString.equals(VCFConstants.MISSING_VALUE_v4))
        return VariantContext.NO_NEG_LOG_10PERROR;
    // parseDouble avoids Double.valueOf's intermediate boxed value
    final double val = Double.parseDouble(qualString);
    // check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles
    if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON))
        return VariantContext.NO_NEG_LOG_10PERROR;
    // scale and return the value (phred in the file, /10 to neg-log10)
    return val / 10.0;
}
/**
 * parse out the alleles
 * @param ref the reference base
 * @param alts a string of alternates to break into alleles
 * @param lineNo the line number for this record
 * @return a list of alleles, with the reference allele first
 */
protected static List<Allele> parseAlleles(String ref, String alts, int lineNo) {
    List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
    // ref
    checkAllele(ref, true, lineNo);
    Allele refAllele = Allele.create(ref, true);
    alleles.add(refAllele);
    if ( alts.indexOf(",") == -1 ) // only 1 alternative, don't call string split
        parseSingleAltAllele(alleles, alts, lineNo);
    else
        for ( String alt : alts.split(",") )
            parseSingleAltAllele(alleles, alt, lineNo);
    return alleles;
}
/**
 * check to make sure the allele is an acceptable allele
 * @param allele the allele to check
 * @param isRef are we the reference allele?
 * @param lineNo the line number for this record
 */
private static void checkAllele(String allele, boolean isRef, int lineNo) {
    if ( allele == null || allele.length() == 0 )
        generateException("Empty alleles are not permitted in VCF records", lineNo);
    if ( isSymbolicAllele(allele) ) {
        // symbolic alleles (<TAG>) are legal only in the ALT column
        if ( isRef ) {
            generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo);
        }
    } else {
        // check for VCF3 insertions or deletions (leading 'I'/'D'), which this codec does not support
        if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) )
            generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" +
                    " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo);
        if (!Allele.acceptableAlleleBases(allele))
            generateException("Unparsable vcf record with allele " + allele, lineNo);
        if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) )
            generateException("The reference allele cannot be missing", lineNo);
    }
}
/**
 * Tests whether an allele string is symbolic, i.e. of the form <SOMETAG>.
 *
 * @param allele the allele to check (may be null)
 * @return true iff the allele is non-null and reads as <TAG> with a non-empty tag
 */
private static boolean isSymbolicAllele(String allele) {
    if ( allele == null || allele.length() <= 2 )
        return false;
    return allele.startsWith("<") && allele.endsWith(">");
}
/**
 * parse a single alternate allele and append it to the allele list
 * @param alleles the alleles available (mutated: the parsed allele is added)
 * @param alt the allele to parse
 * @param lineNo the line number for this record
 */
private static void parseSingleAltAllele(List<Allele> alleles, String alt, int lineNo) {
    checkAllele(alt, false, lineNo);
    Allele allele = Allele.create(alt, false);
    // a '.' ALT produces a no-call allele, which is simply dropped
    if ( ! allele.isNoCall() )
        alleles.add(allele);
}
/**
 * Tests whether every allele is exactly one base long, i.e. whether the
 * record describes a pure point event (no indel/MNP span to clip).
 *
 * @param alleles the alleles to inspect
 * @return true iff no allele has length != 1
 */
protected static boolean isSingleNucleotideEvent(List<Allele> alleles) {
    for ( int i = 0; i < alleles.size(); i++ ) {
        if ( alleles.get(i).length() != 1 )
            return false;
    }
    return true;
}
/**
 * Computes the number of leading bases shared by all concrete alleles and the
 * reference: 1 if every non-symbolic allele starts with the first reference
 * base, 0 otherwise (so at most one base is forward-clipped).
 *
 * @param unclippedAlleles the alleles before clipping (symbolic alleles are ignored)
 * @param ref the reference allele string; assumed non-empty (validated earlier by checkAllele)
 * @return 1 if the shared leading base can be clipped, else 0
 */
public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
    // hoisted out of the loop: String.getBytes() copies the string on every call
    final byte firstRefBase = ref.getBytes()[0];
    for ( Allele a : unclippedAlleles ) {
        if ( a.isSymbolic() )
            continue;
        if ( a.length() < 1 || a.getBases()[0] != firstRefBase )
            return 0;
    }
    return 1;
}
/**
 * Computes how many trailing bases can be clipped from all concrete alleles,
 * advancing one base per round for as long as every allele still matches the
 * reference at its tail and would not be fully consumed.
 *
 * @param unclippedAlleles the alleles before clipping (symbolic alleles are ignored)
 * @param ref the reference allele string (assumed ASCII, so byte length == char length)
 * @param forwardClipping bases already reserved at the front (see computeForwardClipping)
 * @param lineNo the current line number, for error reporting
 * @return the number of bases to clip from the end of every allele
 */
protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) {
    // hoisted out of the doubly-nested loop: getBytes() copies the string on every call
    final byte[] refBases = ref.getBytes();
    int clipping = 0;
    boolean stillClipping = true;
    while ( stillClipping ) {
        for ( Allele a : unclippedAlleles ) {
            if ( a.isSymbolic() )
                continue;
            // stop once clipping would eat into the forward-clipped prefix or empty an allele
            if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
                stillClipping = false;
            else if ( ref.length() == clipping )
                generateException("bad alleles encountered", lineNo);
            else if ( a.getBases()[a.length()-clipping-1] != refBases[ref.length()-clipping-1] )
                stillClipping = false;
        }
        if ( stillClipping )
            clipping++;
    }
    return clipping;
}
/**
 * clip the alleles, based on the reference
 *
 * @param position the unadjusted start position (pre-clipping)
 * @param ref the reference string
 * @param unclippedAlleles the list of unclipped alleles
 * @param clippedAlleles output list of clipped alleles; may be null when the
 *                       caller only needs the end position (see decodeLoc)
 * @param lineNo the current line number in the file
 * @return the new reference end position of this event
 */
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
    int forwardClipping = computeForwardClipping(unclippedAlleles, ref);
    int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo);
    if ( clippedAlleles != null ) {
        for ( Allele a : unclippedAlleles ) {
            if ( a.isSymbolic() ) {
                // symbolic alleles carry no bases to clip; pass through untouched
                clippedAlleles.add(a);
            } else {
                clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference()));
            }
        }
    }
    // the new reference length
    int refLength = ref.length() - reverseClipping;
    // Math.max guards the degenerate case where the whole reference was clipped
    return position+Math.max(refLength - 1,0);
}
/**
 * Probes whether the file looks like a VCF: plain text, gzip-compressed, or
 * BGZF (block-compressed), by checking for the magic header line in each form.
 *
 * @param potentialInput    the file to probe
 * @param MAGIC_HEADER_LINE the header text that identifies this VCF version
 * @return true if any of the three stream forms starts with the magic header
 */
public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) {
    try {
        return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
               isVCFStream(openGzipStream(potentialInput), MAGIC_HEADER_LINE) ||
               isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE);
    } catch ( IOException e ) {
        // covers FileNotFoundException too (it is an IOException subclass);
        // any I/O failure simply means "not decodable by us"
        return false;
    }
}

/**
 * Opens a gzip view of the file. The GZIPInputStream constructor reads and
 * validates the gzip header, so if it throws, the wrapped FileInputStream would
 * otherwise leak; close it explicitly before propagating the failure.
 */
private static InputStream openGzipStream(final File f) throws IOException {
    final FileInputStream fis = new FileInputStream(f);
    try {
        return new GZIPInputStream(fis);
    } catch ( IOException e ) {
        fis.close();
        throw e;
    }
}
/**
 * Checks whether the stream begins with the given magic header line.
 * Always closes the stream, even on failure.
 *
 * @param stream            the stream to probe (consumed and closed)
 * @param MAGIC_HEADER_LINE the expected leading bytes
 * @return true if the stream's first bytes exactly match the magic header
 */
private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) {
    try {
        final byte[] buff = new byte[MAGIC_HEADER_LINE.length()];

        // InputStream.read(byte[],int,int) may legally return fewer bytes than
        // requested (common with decompressing streams); the original issued a
        // single read and ignored the count, which could reject a valid file.
        // Loop until the buffer is full or EOF.
        int nread = 0;
        while ( nread < buff.length ) {
            final int n = stream.read(buff, nread, buff.length - nread);
            if ( n < 0 )
                break; // EOF before the full header: cannot match
            nread += n;
        }

        return nread == buff.length && Arrays.equals(buff, MAGIC_HEADER_LINE.getBytes());
    } catch ( IOException e ) {
        return false;
    } catch ( RuntimeException e ) {
        // e.g. a decompressor rejecting malformed input
        return false;
    } finally {
        try { stream.close(); } catch ( IOException e ) {}
    }
}
}

View File

@ -0,0 +1,456 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
/**
* Immutable representation of an allele
*
* Types of alleles:
*
* Ref: a t C g a // C is the reference base
*
* : a t G g a // C base is a G in some individuals
*
* : a t - g a // C base is deleted w.r.t. the reference
*
* : a t CAg a // A base is inserted w.r.t. the reference sequence
*
* In these cases, where are the alleles?
*
* SNP polymorphism of C/G -> { C , G } -> C is the reference allele
* 1 base deletion of C -> { C , - } -> C is the reference allele
* 1 base insertion of A -> { - ; A } -> Null is the reference allele
*
 * Suppose I see the following in the population:
*
* Ref: a t C g a // C is the reference base
* : a t G g a // C base is a G in some individuals
* : a t - g a // C base is deleted w.r.t. the reference
*
* How do I represent this? There are three segregating alleles:
*
* { C , G , - }
*
* Now suppose I have this more complex example:
*
* Ref: a t C g a // C is the reference base
* : a t - g a
* : a t - - a
* : a t CAg a
*
* There are actually four segregating alleles:
*
* { C g , - g, - -, and CAg } over bases 2-4
*
* However, the molecular equivalence explicitly listed above is usually discarded, so the actual
* segregating alleles are:
*
* { C g, g, -, C a g }
*
* Critically, it should be possible to apply an allele to a reference sequence to create the
* correct haplotype sequence:
*
* Allele + reference => haplotype
*
* For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the
 * Allele object itself.  So there's an idea of an A/C polymorphism independent of its surrounding context.
*
* Given list of alleles it's possible to determine the "type" of the variation
*
* A / C @ loc => SNP with
* - / A => INDEL
*
 * If you know which allele is the reference, you can determine whether the variant is an insertion or deletion.
*
 * Allele also supports the concept of a NO_CALL allele.  This Allele represents a haplotype that couldn't be
* determined. This is usually represented by a '.' allele.
*
* Note that Alleles store all bases as bytes, in **UPPER CASE**. So 'atc' == 'ATC' from the perspective of an
* Allele.
* @author ebanks, depristo
*/
class Allele implements Comparable<Allele> {
    private static final byte[] EMPTY_ALLELE_BASES = new byte[0];

    // classification flags: at most one of isNull / isNoCall / isSymbolic is set;
    // isRef marks this as the reference allele at its site
    private boolean isRef = false;
    private boolean isNull = false;
    private boolean isNoCall = false;
    private boolean isSymbolic = false;

    // the segregating bases; empty for null and no-call alleles, and the literal
    // "<TAG>" text for symbolic alleles
    private byte[] bases = null;

    public final static String NULL_ALLELE_STRING = "-";
    public final static String NO_CALL_STRING = ".";
    /** A generic static NO_CALL allele for use */

    // no public way to create an allele
    private Allele(byte[] bases, boolean isRef) {
        // standardize our representation of null allele and bases
        if ( wouldBeNullAllele(bases) ) {
            bases = EMPTY_ALLELE_BASES;
            isNull = true;
        } else if ( wouldBeNoCallAllele(bases) ) {
            bases = EMPTY_ALLELE_BASES;
            isNoCall = true;
            if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
        } else if ( wouldBeSymbolicAllele(bases) ) {
            isSymbolic = true;
            if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele");
        }
        // else
        //    bases = new String(bases).toUpperCase().getBytes(); // todo -- slow performance
        // NOTE(review): the upper-casing above is disabled for performance, so bases
        // are stored exactly as supplied even though the class javadoc promises
        // UPPER CASE storage; mixed-case inputs may therefore fail bases
        // comparisons (see basesMatch(String)) -- confirm this is intended

        this.isRef = isRef;
        this.bases = bases;

        if ( ! acceptableAlleleBases(bases) )
            throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'");
    }

    private Allele(String bases, boolean isRef) {
        this(bases.getBytes(), isRef);
    }

    // interned alleles for the common single-base cases, handed out by create()
    // to avoid per-call allocation
    private final static Allele REF_A = new Allele("A", true);
    private final static Allele ALT_A = new Allele("A", false);
    private final static Allele REF_C = new Allele("C", true);
    private final static Allele ALT_C = new Allele("C", false);
    private final static Allele REF_G = new Allele("G", true);
    private final static Allele ALT_G = new Allele("G", false);
    private final static Allele REF_T = new Allele("T", true);
    private final static Allele ALT_T = new Allele("T", false);
    private final static Allele REF_N = new Allele("N", true);
    private final static Allele ALT_N = new Allele("N", false);
    private final static Allele REF_NULL = new Allele(NULL_ALLELE_STRING, true);
    private final static Allele ALT_NULL = new Allele(NULL_ALLELE_STRING, false);

    public final static Allele NO_CALL = new Allele(NO_CALL_STRING, false);

    // ---------------------------------------------------------------------------------------------------------
    //
    // creation routines
    //
    // ---------------------------------------------------------------------------------------------------------

    /**
     * Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases
     * == '-', a Null allele is created. If bases == '.', a no call Allele is created.
     *
     * @param bases the DNA sequence of this variation, '-', or '.'
     * @param isRef should we make this a reference allele?
     * @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformed
     */
    public static Allele create(byte[] bases, boolean isRef) {
        if ( bases == null )
            throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele");

        if ( bases.length == 1 ) {
            // optimization to return a static constant Allele for each single base object
            switch (bases[0]) {
                case '.':
                    if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
                    return NO_CALL;
                case '-': return isRef ? REF_NULL : ALT_NULL;
                case 'A': case 'a' : return isRef ? REF_A : ALT_A;
                case 'C': case 'c' : return isRef ? REF_C : ALT_C;
                case 'G': case 'g' : return isRef ? REF_G : ALT_G;
                case 'T': case 't' : return isRef ? REF_T : ALT_T;
                case 'N': case 'n' : return isRef ? REF_N : ALT_N;
                default: throw new IllegalArgumentException("Illegal base: " + (char)bases[0]);
            }
        } else {
            return new Allele(bases, isRef);
        }
    }

    /** Single-base convenience overload; see {@link #create(byte[], boolean)}. */
    public static Allele create(byte base, boolean isRef) {
//    public Allele(byte base, boolean isRef) {
        return create( new byte[]{ base }, isRef);
    }

    /** Creates a single-base, non-reference allele. */
    public static Allele create(byte base) {
        return create( base, false );
    }

    /**
     * Appends the bases in right onto the end of left, preserving left's reference status.
     *
     * @param left  the allele to extend; must not be symbolic
     * @param right the bases to append
     * @return a new (possibly interned) Allele with the combined bases
     */
    public static Allele extend(Allele left, byte[] right) {
        if (left.isSymbolic())
            throw new IllegalArgumentException("Cannot extend a symbolic allele");
        byte[] bases = null;
        if ( left.length() == 0 )
            bases = right;
        else {
            bases = new byte[left.length() + right.length];
            System.arraycopy(left.getBases(), 0, bases, 0, left.length());
            System.arraycopy(right, 0, bases, left.length(), right.length);
        }

        return create(bases, left.isReference());
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent the null allele
     */
    public static boolean wouldBeNullAllele(byte[] bases) {
        return (bases.length == 1 && bases[0] == '-') || bases.length == 0;
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent the NO_CALL allele
     */
    public static boolean wouldBeNoCallAllele(byte[] bases) {
        return bases.length == 1 && bases[0] == '.';
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent a symbolic allele
     */
    public static boolean wouldBeSymbolicAllele(byte[] bases) {
        // symbolic alleles look like <TAG>; the bare "<>" (length 2) is rejected
        return bases.length > 2 && bases[0] == '<' && bases[bases.length-1] == '>';
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent a well formatted allele
     */
    public static boolean acceptableAlleleBases(String bases) {
        return acceptableAlleleBases(bases.getBytes());
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent a well formatted allele (null, no-call,
     *         symbolic, or composed solely of ACGTN in either case)
     */
    public static boolean acceptableAlleleBases(byte[] bases) {
        if ( wouldBeNullAllele(bases) || wouldBeNoCallAllele(bases) || wouldBeSymbolicAllele(bases) )
            return true;

        for ( int i = 0; i < bases.length; i++ ) {
            switch (bases[i]) {
                case 'A': case 'C': case 'G': case 'T': case 'N' : case 'a': case 'c': case 'g': case 't': case 'n' :
                    break;
                default:
                    return false;
            }
        }

        return true;
    }

    /**
     * @see #create(byte[], boolean)
     *
     * @param bases  bases representing an allele
     * @param isRef  is this the reference allele?
     */
    public static Allele create(String bases, boolean isRef) {
    //public Allele(String bases, boolean isRef) {
        return create(bases.getBytes(), isRef);
    }

    /**
     * Creates a non-Ref allele.  @see Allele(byte[], boolean) for full information
     *
     * @param bases  bases representing an allele
     */
    public static Allele create(String bases) {
        return create(bases, false);
    }

    /**
     * Creates a non-Ref allele.  @see Allele(byte[], boolean) for full information
     *
     * @param bases  bases representing an allele
     */
    public static Allele create(byte[] bases) {
        return create(bases, false);
        //this(bases, false);
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // accessor routines
    //
    // ---------------------------------------------------------------------------------------------------------

    // Returns true if this is the null allele
    public boolean isNull() { return isNull; }
    // Returns true if this is not the null allele
    public boolean isNonNull() { return ! isNull(); }

    // Returns true if this is the NO_CALL allele
    public boolean isNoCall() { return isNoCall; }
    // Returns true if this is not the NO_CALL allele
    public boolean isCalled() { return ! isNoCall(); }

    // Returns true if this Allele is the reference allele
    public boolean isReference() { return isRef; }
    // Returns true if this Allele is not the reference allele
    public boolean isNonReference() { return ! isReference(); }

    // Returns true if this Allele is symbolic (i.e. no well-defined base sequence)
    public boolean isSymbolic() { return isSymbolic; }

    // Returns a nice string representation of this object; reference alleles get a trailing '*'
    public String toString() {
        return (isNull() ? NULL_ALLELE_STRING : ( isNoCall() ? NO_CALL_STRING : getDisplayString() )) + (isReference() ? "*" : "");
    }

    /**
     * Return the DNA bases segregating in this allele.  Note this isn't reference polarized,
     * so the Null allele is represented by a vector of length 0
     *
     * @return the segregating bases
     */
    public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; }

    /**
     * Return the DNA bases segregating in this allele in String format.
     * This is useful, because toString() adds a '*' to reference alleles and getBases() returns garbage when you call toString() on it.
     *
     * @return the segregating bases
     */
    public String getBaseString() { return new String(getBases()); }

    /**
     * Return the printed representation of this allele.
     * Same as getBaseString(), except for symbolic alleles.
     * For symbolic alleles, the base string is empty while the display string contains <TAG>.
     *
     * @return the allele string representation
     */
    public String getDisplayString() { return new String(bases); }

    /**
     * @param other  the other allele
     *
     * @return true if these alleles are equal (bases and reference status both considered)
     */
    public boolean equals(Object other) {
        return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) );
    }

    /**
     * @return hash code; derived from bases only, which is consistent with
     *         equals(Object) since equal alleles necessarily share the same bases
     */
    public int hashCode() {
        int hash = 1;
        for (int i = 0; i < bases.length; i++)
            hash += (i+1) * bases[i];
        return hash;
    }

    /**
     * Returns true if this and other are equal.  If ignoreRefState is true, then doesn't require both alleles has the
     * same ref tag
     *
     * @param other            allele to compare to
     * @param ignoreRefState   if true, ignore ref state in comparison
     * @return true if this and other are equal
     */
    public boolean equals(Allele other, boolean ignoreRefState) {
        return this == other || (isRef == other.isRef || ignoreRefState) && isNull == other.isNull && isNoCall == other.isNoCall && (bases == other.bases || Arrays.equals(bases, other.bases));
    }

    /**
     * @param test  bases to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); }

    /**
     * @param test  bases to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    // NOTE(review): test is upper-cased here but stored bases are not normalized
    // (see the constructor), so lower-case stored bases will never match -- confirm
    public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); }

    /**
     * @param test  allele to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); }

    /**
     * @return the length of this allele.  Null and NO_CALL alleles have 0 length.
     */
    public int length() {
        return isSymbolic ? 0 : bases.length;
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // useful static functions
    //
    // ---------------------------------------------------------------------------------------------------------

    // Finds the allele in allAlleles whose bases match alleleBases, if any
    public static Allele getMatchingAllele(Collection<Allele> allAlleles, String alleleBases) {
        return getMatchingAllele(allAlleles, alleleBases.getBytes());
    }

    // Finds the allele in allAlleles whose bases match alleleBases; falls back to
    // NO_CALL for '.', and null when nothing matches
    public static Allele getMatchingAllele(Collection<Allele> allAlleles, byte[] alleleBases) {
        for ( Allele a : allAlleles ) {
            if ( a.basesMatch(alleleBases) ) {
                return a;
            }
        }

        if ( wouldBeNoCallAllele(alleleBases) )
            return NO_CALL;
        else
            return null; // couldn't find anything
    }

    /**
     * Resolves each string in alleleStrings to an Allele from possibleAlleles.
     * No-call strings are instantiated on the fly; anything else that is absent
     * from possibleAlleles raises IllegalArgumentException.
     */
    public static List<Allele> resolveAlleles(List<Allele> possibleAlleles, List<String> alleleStrings) {
        List<Allele> myAlleles = new ArrayList<Allele>(alleleStrings.size());

        for ( String alleleString : alleleStrings ) {
            Allele allele = getMatchingAllele(possibleAlleles, alleleString);

            if ( allele == null ) {
                if ( Allele.wouldBeNoCallAllele(alleleString.getBytes()) ) {
                    allele = create(alleleString);
                } else {
                    throw new IllegalArgumentException("Allele " + alleleString + " not present in the list of alleles " + possibleAlleles);
                }
            }

            myAlleles.add(allele);
        }

        return myAlleles;
    }

    // Orders the reference allele first, then lexicographically by base string
    public int compareTo(Allele other) {
        if ( isReference() && other.isNonReference() )
            return -1;
        else if ( isNonReference() && other.isReference() )
            return 1;
        else
            return getBaseString().compareTo(other.getBaseString()); // todo -- potential performance issue
    }

    // Returns true if the shorter of the two alleles is a prefix of the longer;
    // the null allele is trivially a prefix of anything
    public static boolean oneIsPrefixOfOther(Allele a1, Allele a2) {
        if ( a1.isNull() || a2.isNull() )
            return true;

        if ( a2.length() >= a1.length() )
            return firstIsPrefixOfSecond(a1, a2);
        else
            return firstIsPrefixOfSecond(a2, a1);
    }

    // Precondition (guaranteed by the caller above): a1 is no longer than a2,
    // otherwise substring() would throw
    private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) {
        String a1String = a1.getBaseString();
        return a2.getBaseString().substring(0, a1String.length()).equals(a1String);
    }
}

View File

@ -0,0 +1,349 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
* This class encompasses all the basic information about a genotype. It is immutable.
*
* @author Mark DePristo
*/
public class Genotype {
    public final static String PHASED_ALLELE_SEPARATOR = "|";
    public final static String UNPHASED_ALLELE_SEPARATOR = "/";

    // per-sample name / quality / filters / attributes container
    protected InferredGeneticContext commonInfo;
    public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
    protected List<Allele> alleles = null; // new ArrayList<Allele>();
    protected Type type = null; // lazily computed and cached by getType()
    protected boolean isPhased = false;
    protected boolean filtersWereAppliedToContext;

    public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
        this(sampleName, alleles, negLog10PError, filters, attributes, isPhased, null);
    }

    /**
     * Fully-specified constructor.
     *
     * @param sampleName        the sample this genotype belongs to
     * @param alleles           the called alleles, or null if unavailable; stored as an unmodifiable view
     * @param negLog10PError    the confidence in these alleles (-log10 scale)
     * @param filters           filters applied to this genotype; null means filters were never applied
     * @param attributes        arbitrary key -> value annotations
     * @param isPhased          whether the alleles are phased
     * @param log10Likelihoods  per-genotype log10 likelihoods in standard VCF ordering, or null if unknown
     */
    public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased, double[] log10Likelihoods) {
        if ( alleles != null )
            this.alleles = Collections.unmodifiableList(alleles);
        commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
        if ( log10Likelihoods != null )
            // stored under the standard VCF PL key
            commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods));
        filtersWereAppliedToContext = filters != null;
        this.isPhased = isPhased;
        validate();
    }

    /**
     * Creates a new Genotype for sampleName with genotype according to alleles.
     * @param sampleName
     * @param alleles
     * @param negLog10PError the confidence in these alleles
     * @param log10Likelihoods a log10 likelihoods for each of the genotype combinations possible for alleles, in the standard VCF ordering, or null if not known
     */
    public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, double[] log10Likelihoods) {
        this(sampleName, alleles, negLog10PError, null, null, false, log10Likelihoods);
    }

    public Genotype(String sampleName, List<Allele> alleles, double negLog10PError) {
        this(sampleName, alleles, negLog10PError, null, null, false);
    }

    public Genotype(String sampleName, List<Allele> alleles) {
        this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null, false);
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // Partial-cloning routines (because Genotype is immutable).
    //
    // ---------------------------------------------------------------------------------------------------------

    // Returns a copy of g with a different sample name
    public static Genotype modifyName(Genotype g, String name) {
        return new Genotype(name, g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased());
    }

    // Returns a copy of g with different attributes
    public static Genotype modifyAttributes(Genotype g, Map<String, Object> attributes) {
        return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased());
    }

    // Returns a copy of g with different alleles
    public static Genotype modifyAlleles(Genotype g, List<Allele> alleles) {
        return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased());
    }

    /**
     * @return the alleles for this genotype
     */
    public List<Allele> getAlleles() {
        return alleles;
    }

    // Returns every occurrence of allele in this genotype (so its size reflects multiplicity)
    public List<Allele> getAlleles(Allele allele) {
        if ( getType() == Type.UNAVAILABLE )
            throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");

        List<Allele> al = new ArrayList<Allele>();
        for ( Allele a : alleles )
            if ( a.equals(allele) )
                al.add(a);

        return Collections.unmodifiableList(al);
    }

    // Returns the i-th allele of this genotype
    public Allele getAllele(int i) {
        if ( getType() == Type.UNAVAILABLE )
            throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
        return alleles.get(i);
    }

    public boolean isPhased() { return isPhased; }

    /**
     * @return the ploidy of this genotype
     */
    public int getPloidy() {
        if ( alleles == null )
            throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype");
        return alleles.size();
    }

    public enum Type {
        NO_CALL,
        HOM_REF,
        HET,
        HOM_VAR,
        UNAVAILABLE,
        MIXED  // no-call and call in the same genotype
    }

    // Lazily computes and caches the genotype class
    public Type getType() {
        if ( type == null ) {
            type = determineType();
        }
        return type;
    }

    // Classifies the allele list into one of the Type values; see Type for semantics
    protected Type determineType() {
        if ( alleles == null )
            return Type.UNAVAILABLE;

        boolean sawNoCall = false, sawMultipleAlleles = false;
        Allele observedAllele = null;

        for ( Allele allele : alleles ) {
            if ( allele.isNoCall() )
                sawNoCall = true;
            else if ( observedAllele == null )
                observedAllele = allele;
            else if ( !allele.equals(observedAllele) )
                sawMultipleAlleles = true;
        }

        if ( sawNoCall ) {
            if ( observedAllele == null )
                return Type.NO_CALL;
            return Type.MIXED;
        }

        if ( observedAllele == null )
            throw new ReviewedStingException("BUG: there are no alleles present in this genotype but the alleles list is not null");

        return sawMultipleAlleles ? Type.HET : observedAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR;
    }

    /**
     * @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false.
     */
    public boolean isHom() { return isHomRef() || isHomVar(); }

    /**
     * @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false.
     */
    public boolean isHomRef() { return getType() == Type.HOM_REF; }

    /**
     * @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false.
     */
    public boolean isHomVar() { return getType() == Type.HOM_VAR; }

    /**
     * @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false.
     */
    public boolean isHet() { return getType() == Type.HET; }

    /**
     * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false.
     */
    public boolean isNoCall() { return getType() == Type.NO_CALL; }

    /**
     * @return true if this genotype is comprised of any alleles that are not no-calls (even if some are).
     */
    public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }

    /**
     * @return true if this genotype is comprised of both calls and no-calls.
     */
    public boolean isMixed() { return getType() == Type.MIXED; }

    /**
     * @return true if the type of this genotype is set.
     */
    public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }

    //
    // Useful methods for getting genotype likelihoods for a genotype object, if present
    //

    // true if a non-missing PL or (deprecated) GL attribute is attached
    public boolean hasLikelihoods() {
        return (hasAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4)) ||
                (hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4));
    }

    // Prefers the PL attribute, falling back to the deprecated GL attribute
    public GenotypeLikelihoods getLikelihoods() {
        GenotypeLikelihoods x = getLikelihoods(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, true);
        if ( x != null )
            return x;
        else {
            x = getLikelihoods(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, false);
            if ( x != null ) return x;
            else
                throw new IllegalStateException("BUG: genotype likelihood field in " + this.getSampleName() + " sample are not either a string or a genotype likelihood class!");
        }
    }

    // Parses the attribute under key as PL (asPL) or GL text, or passes a
    // GenotypeLikelihoods object through; returns null for other types
    private GenotypeLikelihoods getLikelihoods(String key, boolean asPL) {
        Object x = getAttribute(key);
        if ( x instanceof String ) {
            if ( asPL )
                return GenotypeLikelihoods.fromPLField((String)x);
            else
                return GenotypeLikelihoods.fromGLField((String)x);
        }
        else if ( x instanceof GenotypeLikelihoods ) return (GenotypeLikelihoods)x;
        else return null;
    }

    // Sanity checks invariants on construction: null alleles list is allowed
    // (UNAVAILABLE), but an empty list or a null element is not
    public void validate() {
        if ( alleles == null ) return;
        if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0");

        // int nNoCalls = 0;
        for ( Allele allele : alleles ) {
            if ( allele == null )
                throw new IllegalArgumentException("BUG: allele cannot be null in Genotype");
            // nNoCalls += allele.isNoCall() ? 1 : 0;
        }
        // Technically, the spec does allow for the below case so this is not an illegal state
        //if ( nNoCalls > 0 && nNoCalls != alleles.size() )
        //    throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
    }

    public String getGenotypeString() {
        return getGenotypeString(true);
    }

    // Renders the genotype as e.g. "A/T" or "A|T"; returns null for UNAVAILABLE
    public String getGenotypeString(boolean ignoreRefState) {
        if ( alleles == null )
            return null;

        // Notes:
        // 1. Make sure to use the appropriate separator depending on whether the genotype is phased
        // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)
        // 3. So that everything is deterministic with regards to integration tests, we sort Alleles (when the genotype isn't phased, of course)
        return ParsingUtils.join(isPhased() ? PHASED_ALLELE_SEPARATOR : UNPHASED_ALLELE_SEPARATOR,
                ignoreRefState ? getAlleleStrings() : (isPhased() ? getAlleles() : ParsingUtils.sortList(getAlleles())));
    }

    // Raw base strings of the alleles, in genotype order
    private List<String> getAlleleStrings() {
        List<String> al = new ArrayList<String>();
        for ( Allele a : alleles )
            al.add(a.getBaseString());

        return al;
    }

    public String toString() {
        int Q = (int)Math.round(getPhredScaledQual());
        // Q == -10 is presumably the phred rendering of the NO_NEG_LOG_10PERROR
        // sentinel (value defined in InferredGeneticContext) -- shown as "."
        return String.format("[%s %s Q%s %s]", getSampleName(), getGenotypeString(false),
                Q == -10 ? "." : String.format("%2d",Q), sortedString(getAttributes()));
    }

    public String toBriefString() {
        return String.format("%s:Q%.2f", getGenotypeString(false), getPhredScaledQual());
    }

    public boolean sameGenotype(Genotype other) {
        return sameGenotype(other, true);
    }

    // Compares the alleles of two genotypes, optionally ignoring phase/order
    public boolean sameGenotype(Genotype other, boolean ignorePhase) {
        if (getPloidy() != other.getPloidy())
            return false; // gotta have the same number of allele to be equal

        // By default, compare the elements in the lists of alleles, element-by-element
        Collection<Allele> thisAlleles = this.getAlleles();
        Collection<Allele> otherAlleles = other.getAlleles();

        if (ignorePhase) { // do not care about order, only identity of Alleles
            // NOTE(review): a TreeSet also discards duplicate alleles, so for
            // ploidy > 2 genotypes differing only in allele multiplicity
            // (e.g. A/A/T vs A/T/T) would compare equal here -- confirm intended
            thisAlleles = new TreeSet<Allele>(thisAlleles);   //implemented Allele.compareTo()
            otherAlleles = new TreeSet<Allele>(otherAlleles);
        }

        return thisAlleles.equals(otherAlleles);
    }

    /**
     * a utility method for generating sorted strings from a map key set.
     * @param c the map
     * @param <T> the map's key type
     * @param <V> the map's value type
     * @return a string, enclosed in {}, with comma separated key value pairs in order of the keys
     */
    private static <T extends Comparable<T>, V> String sortedString(Map<T, V> c) {
        // NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS
        List<T> t = new ArrayList<T>(c.keySet());
        Collections.sort(t);

        List<String> pairs = new ArrayList<String>();
        for (T k : t) {
            pairs.add(k + "=" + c.get(k));
        }

        return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}";
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // get routines to access context info fields
    //
    // ---------------------------------------------------------------------------------------------------------

    // Thin delegates to the shared per-sample info container
    public String getSampleName() { return commonInfo.getName(); }
    public Set<String> getFilters() { return commonInfo.getFilters(); }
    public boolean isFiltered() { return commonInfo.isFiltered(); }
    public boolean isNotFiltered() { return commonInfo.isNotFiltered(); }
    public boolean filtersWereApplied() { return filtersWereAppliedToContext; }
    public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); }
    public double getNegLog10PError() { return commonInfo.getNegLog10PError(); }
    public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); }

    public Map<String, Object> getAttributes() { return commonInfo.getAttributes(); }
    public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); }
    public Object getAttribute(String key) { return commonInfo.getAttribute(key); }

    public Object getAttribute(String key, Object defaultValue) {
        return commonInfo.getAttribute(key, defaultValue);
    }

    public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
    public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
    public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
    public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
}

View File

@ -0,0 +1,196 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.EnumMap;
import java.util.Map;
public class GenotypeLikelihoods {
public static final boolean CAP_PLS = false;
public static final int PL_CAP = 255;
//
// There are two objects here because we are lazy in creating both representations
// for this object: a vector of log10 Probs and the PL phred-scaled string. Supports
// having one set during initializating, and dynamic creation of the other, if needed
//
private double[] log10Likelihoods = null;
private String likelihoodsAsString_PLs = null;
/** Creates likelihoods from a VCF PL field string (comma-separated phred-scaled ints). */
public final static GenotypeLikelihoods fromPLField(String PLs) {
    return new GenotypeLikelihoods(PLs);
}

/** Creates likelihoods from a deprecated VCF GL field string (log10 doubles). */
public final static GenotypeLikelihoods fromGLField(String GLs) {
    return new GenotypeLikelihoods(parseDeprecatedGLString(GLs));
}

/** Creates likelihoods directly from a vector of log10 likelihoods. */
public final static GenotypeLikelihoods fromLog10Likelihoods(double[] log10Likelihoods) {
    return new GenotypeLikelihoods(log10Likelihoods);
}
//
// You must use the factory methods now
//

// Stores only the PL string; the log10 vector is parsed lazily in getAsVector()
protected GenotypeLikelihoods(String asString) {
    likelihoodsAsString_PLs = asString;
}

// Stores only the log10 vector; the PL string is rendered lazily in getAsString()
protected GenotypeLikelihoods(double[] asVector) {
    log10Likelihoods = asVector;
}
/**
 * Returns the genotypes likelihoods in negative log10 vector format. pr{AA} = x, this
 * vector returns math.log10(x) for each of the genotypes. Can return null if the
 * genotype likelihoods are "missing".
 *
 * @return the log10 likelihoods vector, or null when the PL field was the missing value
 */
public double[] getAsVector() {
    // assumes one of the likelihoods vector or the string isn't null
    if ( log10Likelihoods == null ) {
        // make sure we create the GL string if it doesn't already exist
        log10Likelihoods = parsePLsIntoLikelihoods(likelihoodsAsString_PLs);
    }

    return log10Likelihoods;
}

public String toString() {
    return getAsString();
}

// Returns (rendering and caching if necessary) the PL-style string representation
public String getAsString() {
    if ( likelihoodsAsString_PLs == null ) {
        // todo -- should we accept null log10Likelihoods and set PLs as MISSING?
        if ( log10Likelihoods == null )
            throw new TribbleException("BUG: Attempted to get likelihoods as strings and neither the vector nor the string is set!");
        likelihoodsAsString_PLs = convertLikelihoodsToPLString(log10Likelihoods);
    }

    return likelihoodsAsString_PLs;
}
//Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
//Returns null in case of missing likelihoods
public EnumMap<Genotype.Type,Double> getAsMap(boolean normalizeFromLog10){
    //Make sure that the log10likelihoods are set
    double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
    if(likelihoods == null)
        return null;
    EnumMap<Genotype.Type,Double> likelihoodsMap = new EnumMap<Genotype.Type, Double>(Genotype.Type.class);
    // NOTE(review): ordinal()-1 maps HOM_REF/HET/HOM_VAR onto indices 0/1/2,
    // which relies on NO_CALL being declared first in Genotype.Type and on a
    // diploid, biallelic likelihoods vector -- confirm callers guarantee this
    likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]);
    likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
    likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]);
    return likelihoodsMap;
}
//Return the neg log10 Genotype Quality (GQ) for the given genotype
//Returns Double.NEGATIVE_INFINITY in case of missing genotype
// GQ = likelihood(requested genotype) - likelihood(best OTHER genotype);
// a fallback formula applies when the requested genotype is not the most likely.
public double getNegLog10GQ(Genotype.Type genotype){
double qual = Double.NEGATIVE_INFINITY;
// un-normalized log10 likelihoods keyed by genotype type; null when PLs missing
EnumMap<Genotype.Type,Double> likelihoods = getAsMap(false);
if(likelihoods == null)
return qual;
// find the best likelihood among all genotypes OTHER than the requested one
for(Map.Entry<Genotype.Type,Double> likelihood : likelihoods.entrySet()){
if(likelihood.getKey() == genotype)
continue;
if(likelihood.getValue() > qual)
qual = likelihood.getValue();
}
//Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best)
qual = likelihoods.get(genotype) - qual;
//Quality of other genotypes 1-P(G)
// negative difference means the requested genotype is NOT the most likely one;
// report -log10(1 - P(genotype)) from normalized probabilities instead
if (qual < 0) {
double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
double chosenGenotype = normalized[genotype.ordinal()-1];
qual = -1.0 * Math.log10(1.0 - chosenGenotype);
}
return qual;
}
/**
 * Parses a comma-separated PL string into log10 likelihoods by undoing the
 * phred scaling (PL = -10 * log10(L)).
 *
 * @param likelihoodsAsString_PLs the PL string, possibly "." for missing
 * @return the log10 likelihood vector, or null when the field is missing
 */
private static final double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
    // a "." value means the likelihoods are missing entirely
    if ( likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) )
        return null;
    final String[] fields = likelihoodsAsString_PLs.split(",");
    final double[] result = new double[fields.length];
    for ( int i = 0; i < fields.length; i++ )
        result[i] = Integer.parseInt(fields[i]) / -10.0;  // undo phred scaling
    return result;
}
/**
 * Back-compatibility function to read old style GL formatted genotype likelihoods in VCF format.
 * GL values are already log10-scaled doubles, so no rescaling is applied.
 *
 * @param GLString comma-separated log10 likelihoods, possibly "." for missing
 * @return the parsed vector, or null when the field is missing
 */
private static final double[] parseDeprecatedGLString(String GLString) {
    // a "." value means the likelihoods are missing entirely
    if ( GLString.equals(VCFConstants.MISSING_VALUE_v4) )
        return null;
    final String[] fields = GLString.split(",");
    final double[] result = new double[fields.length];
    for ( int i = 0; i < fields.length; i++ )
        result[i] = Double.parseDouble(fields[i]);
    return result;
}
/**
 * Converts log10 likelihoods into a phred-scaled PL string, normalized so the
 * best genotype has PL == 0 and (optionally) capped at PL_CAP.
 *
 * @param GLs the log10 likelihood vector, or null for missing
 * @return the comma-separated PL string, or "." when GLs is null
 */
private static final String convertLikelihoodsToPLString(double[] GLs) {
    if ( GLs == null )
        return VCFConstants.MISSING_VALUE_v4;

    // normalize against the best likelihood so the most likely genotype gets PL 0
    double best = Double.NEGATIVE_INFINITY;
    for ( final double gl : GLs )
        best = Math.max(best, gl);

    final StringBuilder sb = new StringBuilder();
    for ( int i = 0; i < GLs.length; i++ ) {
        if ( i > 0 )
            sb.append(",");
        long PL = Math.round(-10 * (GLs[i] - best));
        if ( CAP_PLS )
            PL = Math.min(PL, PL_CAP);
        sb.append(PL);
    }
    return sb.toString();
}
}

View File

@ -0,0 +1,143 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.v13;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.Tribble;
import org.broad.tribble.TribbleException;
import org.broad.tribble.index.DynamicIndexCreator;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broad.tribble.util.PositionalStream;
import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.*;
/**
 * Base class for VCF writers that can optionally build a Tribble index on the
 * fly while records are written.  Subclasses provide the actual encoding of
 * headers and records; this class manages the output stream and the indexer.
 */
abstract class IndexingVCFWriter implements VCFWriter {
// human-readable name for error messages (file path or stream toString)
final private String name;
private final SAMSequenceDictionary refDict;
private OutputStream outputStream;
// wraps the raw stream so byte offsets of records can be tracked (indexing only)
private PositionalStream positionalStream = null;
// non-null only when on-the-fly indexing is enabled AND the index file could be created
private DynamicIndexCreator indexer = null;
private LittleEndianOutputStream idxStream = null;
@Requires({"name != null",
"! ( location == null && output == null )",
"! ( enableOnTheFlyIndexing && location == null )"})
protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
outputStream = output;
this.name = name;
this.refDict = refDict;
if ( enableOnTheFlyIndexing ) {
try {
idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location)));
//System.out.println("Creating index on the fly for " + location);
indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
indexer.initialize(location, indexer.defaultBinSize());
// route all writes through the positional stream so record offsets are known
positionalStream = new PositionalStream(output);
outputStream = positionalStream;
} catch ( IOException ex ) {
// No matter what we keep going, since we don't care if we can't create the index file
idxStream = null;
indexer = null;
positionalStream = null;
}
}
}
// Returns the stream subclasses should write to (the positional wrapper when indexing).
@Ensures("result != null")
public OutputStream getOutputStream() {
return outputStream;
}
@Ensures("result != null")
public String getStreamName() {
return name;
}
public abstract void writeHeader(VCFHeader header);
/**
 * attempt to close the VCF file
 *
 * NOTE(review): only the index stream is closed here -- the underlying
 * outputStream is presumably closed by subclasses; confirm, otherwise it leaks.
 * If finalizeIndex/write throws, idxStream is also left unclosed.
 */
public void close() {
// try to close the index stream (keep it separate to help debugging efforts)
if ( indexer != null ) {
try {
Index index = indexer.finalizeIndex(positionalStream.getPosition());
IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict);
index.write(idxStream);
idxStream.close();
} catch (IOException e) {
throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
}
}
}
/**
 * add a record to the file
 *
 * @param vc the Variant Context object
 */
public void add(VariantContext vc) {
// if we are doing on the fly indexing, add the record ***before*** we write any bytes
if ( indexer != null )
indexer.addFeature(vc, positionalStream.getPosition());
}
/**
 * Returns a reasonable "name" for this writer, to display to the user if something goes wrong
 *
 * @param location the output file, or null if writing to a bare stream
 * @param stream the output stream; used for naming only when location is null
 * @return a display name for error reporting
 */
protected static final String writerName(final File location, final OutputStream stream) {
return location == null ? stream.toString() : location.getAbsolutePath();
}
/**
 * Returns a output stream writing to location, or throws a UserException if this fails
 * @param location the file to open for writing
 * @return a fresh FileOutputStream on location
 */
protected static OutputStream openOutputStream(final File location) {
try {
return new FileOutputStream(location);
} catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e);
}
}
}

View File

@ -0,0 +1,243 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import java.util.*;
/**
* Common utility routines for VariantContext and Genotype
*
* @author depristo
*/
final class InferredGeneticContext {
public static final double NO_NEG_LOG_10PERROR = -1.0;
private static Set<String> NO_FILTERS = Collections.unmodifiableSet(new HashSet<String>());
private static Map<String, Object> NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap<String, Object>());
private double negLog10PError = NO_NEG_LOG_10PERROR;
private String name = null;
private Set<String> filters = NO_FILTERS;
private Map<String, Object> attributes = NO_ATTRIBUTES;
// public InferredGeneticContext(String name) {
// this.name = name;
// }
//
// public InferredGeneticContext(String name, double negLog10PError) {
// this(name);
// setNegLog10PError(negLog10PError);
// }
public InferredGeneticContext(String name, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
this.name = name;
setNegLog10PError(negLog10PError);
if ( filters != null )
setFilters(filters);
if ( attributes != null )
setAttributes(attributes);
}
/**
* @return the name
*/
public String getName() {
return name;
}
/**
* Sets the name
*
* @param name the name associated with this information
*/
public void setName(String name) {
if ( name == null ) throw new IllegalArgumentException("Name cannot be null " + this);
this.name = name;
}
// ---------------------------------------------------------------------------------------------------------
//
// Filter
//
// ---------------------------------------------------------------------------------------------------------
public Set<String> getFilters() {
return Collections.unmodifiableSet(filters);
}
public boolean isFiltered() {
return filters.size() > 0;
}
public boolean isNotFiltered() {
return ! isFiltered();
}
public void addFilter(String filter) {
if ( filters == NO_FILTERS ) // immutable -> mutable
filters = new HashSet<String>(filters);
if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this);
if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this);
filters.add(filter);
}
public void addFilters(Collection<String> filters) {
if ( filters == null ) throw new IllegalArgumentException("BUG: Attempting to add null filters at" + this);
for ( String f : filters )
addFilter(f);
}
public void clearFilters() {
filters = new HashSet<String>();
}
public void setFilters(Collection<String> filters) {
clearFilters();
addFilters(filters);
}
// ---------------------------------------------------------------------------------------------------------
//
// Working with log error rates
//
// ---------------------------------------------------------------------------------------------------------
public boolean hasNegLog10PError() {
return getNegLog10PError() != NO_NEG_LOG_10PERROR;
}
/**
* @return the -1 * log10-based error estimate
*/
public double getNegLog10PError() { return negLog10PError; }
public double getPhredScaledQual() { return getNegLog10PError() * 10; }
public void setNegLog10PError(double negLog10PError) {
if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError);
if ( Double.isInfinite(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be Infinity");
if ( Double.isNaN(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be NaN");
this.negLog10PError = negLog10PError;
}
// ---------------------------------------------------------------------------------------------------------
//
// Working with attributes
//
// ---------------------------------------------------------------------------------------------------------
public void clearAttributes() {
attributes = new HashMap<String, Object>();
}
/**
* @return the attribute map
*/
public Map<String, Object> getAttributes() {
return Collections.unmodifiableMap(attributes);
}
// todo -- define common attributes as enum
public void setAttributes(Map<String, ?> map) {
clearAttributes();
putAttributes(map);
}
public void putAttribute(String key, Object value) {
putAttribute(key, value, false);
}
public void putAttribute(String key, Object value, boolean allowOverwrites) {
if ( ! allowOverwrites && hasAttribute(key) )
throw new IllegalStateException("Attempting to overwrite key->value binding: key = " + key + " this = " + this);
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
attributes = new HashMap<String, Object>();
attributes.put(key, value);
}
public void removeAttribute(String key) {
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
attributes = new HashMap<String, Object>();
attributes.remove(key);
}
public void putAttributes(Map<String, ?> map) {
if ( map != null ) {
// for efficiency, we can skip the validation if the map is empty
if ( attributes.size() == 0 ) {
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
attributes = new HashMap<String, Object>();
attributes.putAll(map);
} else {
for ( Map.Entry<String, ?> elt : map.entrySet() ) {
putAttribute(elt.getKey(), elt.getValue(), false);
}
}
}
}
public boolean hasAttribute(String key) {
return attributes.containsKey(key);
}
public int getNumAttributes() {
return attributes.size();
}
/**
* @param key the attribute key
*
* @return the attribute value for the given key (or null if not set)
*/
public Object getAttribute(String key) {
return attributes.get(key);
}
public Object getAttribute(String key, Object defaultValue) {
if ( hasAttribute(key) )
return attributes.get(key);
else
return defaultValue;
}
public String getAttributeAsString(String key, String defaultValue) {
Object x = getAttribute(key);
if ( x == null ) return defaultValue;
if ( x instanceof String ) return (String)x;
return String.valueOf(x); // throws an exception if this isn't a string
}
public int getAttributeAsInt(String key, int defaultValue) {
Object x = getAttribute(key);
if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue;
if ( x instanceof Integer ) return (Integer)x;
return Integer.valueOf((String)x); // throws an exception if this isn't a string
}
public double getAttributeAsDouble(String key, double defaultValue) {
Object x = getAttribute(key);
if ( x == null ) return defaultValue;
if ( x instanceof Double ) return (Double)x;
return Double.valueOf((String)x); // throws an exception if this isn't a string
}
public boolean getAttributeAsBoolean(String key, boolean defaultValue) {
Object x = getAttribute(key);
if ( x == null ) return defaultValue;
if ( x instanceof Boolean ) return (Boolean)x;
return Boolean.valueOf((String)x); // throws an exception if this isn't a string
}
// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
}

View File

@ -0,0 +1,68 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import java.util.*;
/**
 * A mutable companion to Genotype: exposes setters for alleles, phase, and the
 * shared InferredGeneticContext fields (name, filters, attributes, error).
 * Use unmodifiableGenotype() to snapshot back into a plain Genotype.
 *
 * @author Mark DePristo
 */
class MutableGenotype extends Genotype {
// Copy constructor: duplicates all of parent's state into a mutable genotype.
public MutableGenotype(Genotype parent) {
super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased());
}
// Copy constructor that rebinds the sample name.
public MutableGenotype(String sampleName, Genotype parent) {
super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased());
}
public MutableGenotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased);
}
public MutableGenotype(String sampleName, List<Allele> alleles, double negLog10PError) {
super(sampleName, alleles, negLog10PError);
}
public MutableGenotype(String sampleName, List<Allele> alleles) {
super(sampleName, alleles);
}
// Snapshots the current state into a fresh immutable Genotype.
public Genotype unmodifiableGenotype() {
return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes(), isPhased());
}
/**
 * Replaces the allele list with a copy of the given list and re-validates.
 *
 * @param alleles list of alleles
 */
public void setAlleles(List<Allele> alleles) {
this.alleles = new ArrayList<Allele>(alleles);
validate();
}
public void setPhase(boolean isPhased) {
super.isPhased = isPhased;
}
// ---------------------------------------------------------------------------------------------------------
//
// InferredGeneticContext mutation operators (thin delegates to commonInfo)
//
// ---------------------------------------------------------------------------------------------------------
public void setName(String name) { commonInfo.setName(name); }
public void addFilter(String filter) { commonInfo.addFilter(filter); }
public void addFilters(Collection<String> filters) { commonInfo.addFilters(filters); }
public void clearFilters() { commonInfo.clearFilters(); }
public void setFilters(Collection<String> filters) { commonInfo.setFilters(filters); }
public void setAttributes(Map<String, ?> map) { commonInfo.setAttributes(map); }
public void clearAttributes() { commonInfo.clearAttributes(); }
public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); }
public void removeAttribute(String key) { commonInfo.removeAttribute(key); }
public void putAttributes(Map<String, ?> map) { commonInfo.putAttributes(map); }
public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); }
public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); }
}

View File

@ -0,0 +1,213 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
/**
 * Mutable version of VariantContext: adds setters for alleles, genotypes, and
 * the shared InferredGeneticContext fields (source, filters, attributes, error).
 *
 * @author depristo
 */
class MutableVariantContext extends VariantContext {
// ---------------------------------------------------------------------------------------------------------
//
// constructors
//
// ---------------------------------------------------------------------------------------------------------
public MutableVariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, Collection<Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes);
}
public MutableVariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, Map<String, Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes);
}
// Sites-only context: no genotypes, no error estimate, no filters/attributes.
public MutableVariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles) {
super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null);
}
public MutableVariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, Collection<Genotype> genotypes) {
super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null);
}
// Copy constructor: duplicates all of parent's state into a mutable context.
public MutableVariantContext(VariantContext parent) {
super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel());
}
/**
 * Sets the alleles segregating in this context to the collect of alleles. Each of which must be unique according
 * to equals() in Allele. Validate() should be called when you are done modifying the context.
 *
 * NOTE(review): clears this.alleles in place before re-adding -- passing this
 * context's own allele collection as the argument would empty it first; verify
 * callers never do that.
 *
 * @param alleles the new allele set
 */
public void setAlleles(Collection<Allele> alleles) {
this.alleles.clear();
for ( Allele a : alleles )
addAllele(a);
}
/**
 * Adds allele to the segregating allele list in this context to the collection of alleles. The new
 * allele must be unique according to equals() in Allele.
 * Validate() should be called when you are done modifying the context.
 *
 * @param allele the allele to add
 */
public void addAllele(Allele allele) {
final boolean allowDuplicates = false; // used to be a parameter
// invalidate the cached variant type; it is recomputed lazily on next access
type = null;
for ( Allele a : alleles ) {
if ( a.basesMatch(allele) && ! allowDuplicates )
throw new IllegalArgumentException("Duplicate allele added to VariantContext" + this);
}
// we are a novel allele
alleles.add(allele);
}
// Replaces the genotype map with an empty one (TreeMap: sorted by sample name).
public void clearGenotypes() {
genotypes = new TreeMap<String, Genotype>();
}
/**
 * Adds this single genotype to the context, not allowing duplicate genotypes to be added
 * @param genotype the genotype to add
 */
public void addGenotypes(Genotype genotype) {
putGenotype(genotype.getSampleName(), genotype, false);
}
/**
 * Adds these genotypes to the context, not allowing duplicate genotypes to be added
 * @param genotypes the genotypes to add
 */
public void addGenotypes(Collection<Genotype> genotypes) {
for ( Genotype g : genotypes ) {
addGenotype(g);
}
}
/**
 * Adds these genotype to the context, not allowing duplicate genotypes to be added.
 * @param genotypes sample name -> genotype map to add
 */
public void addGenotypes(Map<String, Genotype> genotypes) {
for ( Map.Entry<String, Genotype> elt : genotypes.entrySet() ) {
addGenotype(elt.getValue());
}
}
/**
 * Adds these genotypes to the context, overwriting existing bindings.
 *
 * @param genotypes sample name -> genotype map to add
 */
public void putGenotypes(Map<String, Genotype> genotypes) {
for ( Map.Entry<String, Genotype> g : genotypes.entrySet() )
putGenotype(g.getKey(), g.getValue());
}
/**
 * Adds these genotypes to the context, overwriting existing bindings.
 *
 * @param genotypes the genotypes to add
 */
public void putGenotypes(Collection<Genotype> genotypes) {
for ( Genotype g : genotypes )
putGenotype(g);
}
/**
 * Adds this genotype to the context, throwing an error if it's already bound.
 *
 * @param genotype the genotype to add
 */
public void addGenotype(Genotype genotype) {
addGenotype(genotype.getSampleName(), genotype);
}
/**
 * Adds this genotype to the context, throwing an error if it's already bound.
 *
 * @param genotype the genotype to add
 */
public void addGenotype(String sampleName, Genotype genotype) {
putGenotype(sampleName, genotype, false);
}
/**
 * Adds this genotype to the context, overwriting any existing binding.
 *
 * @param genotype the genotype to add
 */
public void putGenotype(Genotype genotype) {
putGenotype(genotype.getSampleName(), genotype);
}
/**
 * Adds this genotype to the context, overwriting any existing binding.
 *
 * @param genotype the genotype to add
 */
public void putGenotype(String sampleName, Genotype genotype) {
putGenotype(sampleName, genotype, true);
}
// Central binding routine: enforces no-overwrite (unless allowed) and that the
// map key matches the genotype's own sample name.
private void putGenotype(String sampleName, Genotype genotype, boolean allowOverwrites) {
if ( hasGenotype(sampleName) && ! allowOverwrites )
throw new IllegalStateException("Attempting to overwrite sample->genotype binding: " + sampleName + " this=" + this);
if ( ! sampleName.equals(genotype.getSampleName()) )
throw new IllegalStateException("Sample name doesn't equal genotype.getSample(): " + sampleName + " genotype=" + genotype);
this.genotypes.put(sampleName, genotype);
}
/**
 * Removes the binding from sampleName to genotype. If this doesn't exist, throws an IllegalArgumentException
 * @param sampleName the sample whose genotype should be removed
 */
public void removeGenotype(String sampleName) {
if ( ! this.genotypes.containsKey(sampleName) )
throw new IllegalArgumentException("Sample name isn't contained in genotypes " + sampleName + " genotypes =" + genotypes);
this.genotypes.remove(sampleName);
}
/**
 * Removes genotype from the context. If this doesn't exist, throws an IllegalArgumentException
 * @param genotype the genotype whose sample binding should be removed
 */
public void removeGenotype(Genotype genotype) {
removeGenotype(genotype.getSampleName());
}
// todo -- add replace genotype routine
// ---------------------------------------------------------------------------------------------------------
//
// InferredGeneticContext mutation operators (thin delegates to commonInfo)
//
// ---------------------------------------------------------------------------------------------------------
public void setSource(String source) { commonInfo.setName(source); }
public void addFilter(String filter) { commonInfo.addFilter(filter); }
public void addFilters(Collection<String> filters) { commonInfo.addFilters(filters); }
public void clearFilters() { commonInfo.clearFilters(); }
public void setFilters(Collection<String> filters) { commonInfo.setFilters(filters); }
public void setAttributes(Map<String, ?> map) { commonInfo.setAttributes(map); }
public void clearAttributes() { commonInfo.clearAttributes(); }
public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); }
public void removeAttribute(String key) { commonInfo.removeAttribute(key); }
public void putAttributes(Map<String, ?> map) { commonInfo.putAttributes(map); }
public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); }
public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); }
public void setID(String id) { putAttribute(ID_KEY, id, true); }
}

View File

@ -0,0 +1,198 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.util.ParsingUtils;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
/**
* A feature codec for the VCF3 specification, to read older VCF files. VCF3 has been
* depreciated in favor of VCF4 (See VCF codec for the latest information)
*
* <p>
* Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example)
* </p>
*
* <p>
* See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
* See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a>
* </p>
*
* @author Mark DePristo
* @since 2010
*/
class VCF3Codec extends AbstractVCFCodec {
public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3";
/**
* @param reader the line reader to take header lines from
* @return the number of header lines
*/
public Object readHeader(LineReader reader) {
List<String> headerStrings = new ArrayList<String>();
String line;
try {
boolean foundHeaderVersion = false;
while ((line = reader.readLine()) != null) {
lineNo++;
if (line.startsWith(VCFHeader.METADATA_INDICATOR)) {
String[] lineFields = line.substring(2).split("=");
if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) {
if ( !VCFHeaderVersion.isVersionString(lineFields[1]) )
throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
foundHeaderVersion = true;
version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 )
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]);
}
headerStrings.add(line);
}
else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
if (!foundHeaderVersion) {
throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version");
}
return createHeader(headerStrings, line);
}
else {
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
}
}
} catch (IOException e) {
throw new RuntimeException("IO Exception ", e);
}
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
}
/**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
* @param filterString the string to parse
* @return a set of the filters applied
*/
protected Set<String> parseFilters(String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
return fFields;
if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status");
// do we have the filter string cached?
if ( filterHash.containsKey(filterString) )
return filterHash.get(filterString);
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
fFields.add(filterString);
else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
filterHash.put(filterString, fFields);
return fFields;
}
/**
* create a genotype map
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public Map<String, Genotype> createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
Set<String> genotypeFilters = null;
Map<String, String> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = new String(genotypeKeyArray[i]);
boolean missing = i >= GTValueSplitSize;
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
} else if ( missing || GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) {
gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4);
} else {
gtAttributes.put(gtKey, new String(GTValueArray[i]));
}
}
}
// check to make sure we found a genotype field
if ( genotypeAlleleLocation < 0 )
generateException("Unable to find the GT field for the record; the GT field is required");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.put(sampleName, new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
GTQual,
genotypeFilters,
gtAttributes,
phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return genotypes;
}
/**
 * @param potentialInput the file to test
 * @return true if the file begins with the VCF3 magic header line and can therefore be decoded by this codec
 */
@Override
public boolean canDecode(final File potentialInput) {
    // delegates to the shared magic-header check in the parent codec
    return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER);
}
}

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
/**
 * @author ebanks
 * A class representing a key=value entry for ALT fields in the VCF header
 */
class VCFAltHeaderLine extends VCFSimpleHeaderLine {

    /**
     * create a VCF ALT header line
     * (the original javadoc said "filter header line" — copy-paste remnant)
     *
     * @param name        the name for this header line
     * @param description the description for this header line
     */
    public VCFAltHeaderLine(String name, String description) {
        super(name, description, SupportedHeaderLineType.ALT);
    }

    /**
     * create a VCF ALT header line from a raw header-line string
     *
     * @param line    the header line
     * @param version the vcf header version
     */
    protected VCFAltHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, SupportedHeaderLineType.ALT);
    }
}

View File

@ -0,0 +1,228 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.util.ParsingUtils;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
/**
* A feature codec for the VCF 4 specification
*
* <p>
* VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a
* header line, and then data lines each containing information about a position in the genome.
* </p>
* <p>One of the main uses of next-generation sequencing is to discover variation amongst large populations
* of related samples. Recently the format for storing next-generation read alignments has been
* standardised by the SAM/BAM file format specification. This has significantly improved the
* interoperability of next-generation tools for alignment, visualisation, and variant calling.
* We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent
* types of sequence variation, including SNPs, indels and larger structural variants, together
* with rich annotations. VCF is usually stored in a compressed manner and can be indexed for
* fast data retrieval of variants from a range of positions on the reference genome.
* The format was developed for the 1000 Genomes Project, and has also been adopted by other projects
* such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements
* various utilities for processing VCF files, including validation, merging and comparing,
* and also provides a general Perl and Python API.
* The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.</p>
*
* <p>
* See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
* See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a>
* </p>
*
* <h2>File format example</h2>
* <pre>
* ##fileformat=VCFv4.0
* #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
* chr1 109 . A T 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99
* chr1 147 . C A 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:294,49:118:-57.87,-34.96,-338.46:99
* </pre>
*
* @author Mark DePristo
* @since 2010
*/
public class VCFCodec extends AbstractVCFCodec {
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
/**
* @param reader the line reader to take header lines from
* @return the number of header lines
*/
public Object readHeader(LineReader reader) {
List<String> headerStrings = new ArrayList<String>();
String line;
try {
boolean foundHeaderVersion = false;
while ((line = reader.readLine()) != null) {
lineNo++;
if (line.startsWith(VCFHeader.METADATA_INDICATOR)) {
String[] lineFields = line.substring(2).split("=");
if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) {
if ( !VCFHeaderVersion.isVersionString(lineFields[1]) )
throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
foundHeaderVersion = true;
version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
if ( version == VCFHeaderVersion.VCF3_3 || version == VCFHeaderVersion.VCF3_2 )
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]);
if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 )
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]);
}
headerStrings.add(line);
}
else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
if (!foundHeaderVersion) {
throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version");
}
return createHeader(headerStrings, line);
}
else {
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
}
}
} catch (IOException e) {
throw new RuntimeException("IO Exception ", e);
}
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
}
/**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
*
* @param filterString the string to parse
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
*/
protected Set<String> parseFilters(String filterString) {
return parseFilters(filterHash, lineNo, filterString);
}
public static Set<String> parseFilters(final Map<String, LinkedHashSet<String>> cache, final int lineNo, final String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
return Collections.emptySet();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
// do we have the filter string cached?
if ( cache != null && cache.containsKey(filterString) )
return Collections.unmodifiableSet(cache.get(filterString));
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
fFields.add(filterString);
else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
fFields = fFields;
if ( cache != null ) cache.put(filterString, fFields);
return Collections.unmodifiableSet(fFields);
}
/**
* create a genotype map
* @param str the string
* @param alleles the list of alleles
* @return a mapping of sample name to genotype object
*/
public Map<String, Genotype> createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
Set<String> genotypeFilters = null;
Map<String, String> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = new String(genotypeKeyArray[i]);
boolean missing = i >= GTValueSplitSize;
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
} else if ( missing ) {
gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4);
} else {
gtAttributes.put(gtKey, new String(GTValueArray[i]));
}
}
}
// check to make sure we found a genotype field if we are a VCF4.0 file
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.put(sampleName,
new Genotype(sampleName,
GTalleles,
GTQual,
genotypeFilters,
gtAttributes,
phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return genotypes;
}
@Override
public boolean canDecode(final File potentialInput) {
return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER);
}
}

View File

@ -0,0 +1,224 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * a base class for compound header lines, which include info lines and format lines (so far)
 */
abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {

    /** The kinds of compound header line, and whether each permits Flag-typed values. */
    public enum SupportedHeaderLineType {
        INFO(true), FORMAT(false);

        public final boolean allowFlagValues;
        SupportedHeaderLineType(boolean flagValues) {
            allowFlagValues = flagValues;
        }
    }

    // the field types
    private String name;
    private int count = -1;
    private VCFHeaderLineCount countType;
    private String description;
    private VCFHeaderLineType type;

    // access methods
    public String getName() { return name; }
    public String getDescription() { return description; }
    public VCFHeaderLineType getType() { return type; }
    public VCFHeaderLineCount getCountType() { return countType; }

    /** @return the fixed count; only legal when the count type is INTEGER */
    public int getCount() {
        if ( countType != VCFHeaderLineCount.INTEGER )
            throw new ReviewedStingException("Asking for header line count when type is not an integer");
        return count;
    }

    /**
     * Resolve the number of values expected for a site with the given number of alternate alleles.
     *
     * @param numAltAlleles the number of alternate alleles at the site
     * @return the expected value count; -1 means unbounded
     */
    public int getCount(int numAltAlleles) {
        int myCount;
        switch ( countType ) {
            case INTEGER: myCount = count; break;
            case UNBOUNDED: myCount = -1; break;
            case A: myCount = numAltAlleles; break;
            // one value per possible diploid genotype: (n+1)(n+2)/2 counting the reference allele
            case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break;
            default: throw new ReviewedStingException("Unknown count type: " + countType);
        }
        return myCount;
    }

    /** Force this line to an unbounded ('.') count. */
    public void setNumberToUnbounded() {
        countType = VCFHeaderLineCount.UNBOUNDED;
        count = -1;
    }

    // our type of line, i.e. format, info, etc
    private final SupportedHeaderLineType lineType;

    /**
     * create a VCF format header line
     *
     * @param name        the name for this header line
     * @param count       the count for this header line
     * @param type        the type for this header line
     * @param description the description for this header line
     * @param lineType    the header line type
     */
    protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
        super(lineType.toString(), "");
        this.name = name;
        this.countType = VCFHeaderLineCount.INTEGER;
        this.count = count;
        this.type = type;
        this.description = description;
        this.lineType = lineType;
        validate();
    }

    /**
     * create a VCF format header line
     *
     * @param name        the name for this header line
     * @param count       the count type for this header line
     * @param type        the type for this header line
     * @param description the description for this header line
     * @param lineType    the header line type
     */
    protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
        super(lineType.toString(), "");
        this.name = name;
        this.countType = count;
        this.type = type;
        this.description = description;
        this.lineType = lineType;
        validate();
    }

    /**
     * create a VCF format header line from its raw string representation
     *
     * @param line     the header line
     * @param version  the VCF header version
     * @param lineType the header line type
     */
    protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
        super(lineType.toString(), "");
        Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
        name = mapping.get("ID");
        count = -1;
        final String numberStr = mapping.get("Number");
        if ( numberStr.equals(VCFConstants.PER_ALLELE_COUNT) ) {
            countType = VCFHeaderLineCount.A;
        } else if ( numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT) ) {
            countType = VCFHeaderLineCount.G;
        } else if ( ((version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) &&
                     numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) ||
                    ((version == VCFHeaderVersion.VCF3_2 || version == VCFHeaderVersion.VCF3_3) &&
                     numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3)) ) {
            // the unbounded encoding differs between VCF3 ("-1") and VCF4 (".")
            countType = VCFHeaderLineCount.UNBOUNDED;
        } else {
            countType = VCFHeaderLineCount.INTEGER;
            count = Integer.valueOf(numberStr);
        }

        type = VCFHeaderLineType.valueOf(mapping.get("Type"));
        if (type == VCFHeaderLineType.Flag && !allowFlagValues())
            throw new IllegalArgumentException("Flag is an unsupported type for this kind of field");

        description = mapping.get("Description");
        if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided
            description = UNBOUND_DESCRIPTION;

        this.lineType = lineType;

        validate();
    }

    // sanity check that every required piece was provided
    private void validate() {
        if ( name == null || type == null || description == null || lineType == null )
            throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s",
                    super.getKey(), name, type, description, lineType ));
    }

    /**
     * make a string representation of this header line
     * @return a string representation
     */
    protected String toStringEncoding() {
        Map<String,Object> map = new LinkedHashMap<String,Object>();
        map.put("ID", name);
        Object number;
        switch ( countType ) {
            case A: number = VCFConstants.PER_ALLELE_COUNT; break;
            case G: number = VCFConstants.PER_GENOTYPE_COUNT; break;
            case UNBOUNDED: number = VCFConstants.UNBOUNDED_ENCODING_v4; break;
            case INTEGER:
            default: number = count;
        }
        map.put("Number", number);
        map.put("Type", type);
        map.put("Description", description);
        return lineType.toString() + "=" + toStringEncoding(map);
    }

    /**
     * returns true if we're equal to another compound header line
     * @param o a compound header line
     * @return true if equal
     */
    public boolean equals(Object o) {
        if ( !(o instanceof VCFCompoundHeaderLine) )
            return false;
        VCFCompoundHeaderLine other = (VCFCompoundHeaderLine)o;
        return equalsExcludingDescription(other) &&
               description.equals(other.description);
    }

    /**
     * hashCode consistent with equals(). The original class overrode equals without
     * hashCode, which breaks hash-based collections of header lines.
     */
    public int hashCode() {
        int result = name.hashCode();
        result = 31 * result + count;
        result = 31 * result + countType.hashCode();
        result = 31 * result + type.hashCode();
        result = 31 * result + lineType.hashCode();
        result = 31 * result + description.hashCode();
        return result;
    }

    public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) {
        return count == other.count &&
               countType == other.countType &&
               type == other.type &&
               lineType == other.lineType &&
               name.equals(other.name);
    }

    public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) {
        return lineType == other.lineType &&
               name.equals(other.name);
    }

    /**
     * do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true)
     * @return true if we do, false otherwise
     */
    abstract boolean allowFlagValues();
}

View File

@ -0,0 +1,112 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.v13;
import java.util.Locale;
/**
 * Shared string/character constants from the VCF specification (field keys, separators,
 * missing-value encodings for both v3 and v4). Pure constant holder — never instantiated.
 */
final class VCFConstants {
    public static final Locale VCF_LOCALE = Locale.US;

    // reserved INFO/FORMAT field keys
    public static final String ANCESTRAL_ALLELE_KEY = "AA";
    public static final String ALLELE_COUNT_KEY = "AC";
    public static final String ALLELE_FREQUENCY_KEY = "AF";
    public static final String ALLELE_NUMBER_KEY = "AN";
    public static final String RMS_BASE_QUALITY_KEY = "BQ";
    public static final String CIGAR_KEY = "CIGAR";
    public static final String DBSNP_KEY = "DB";
    public static final String DEPTH_KEY = "DP";
    public static final String DOWNSAMPLED_KEY = "DS";
    public static final String EXPECTED_ALLELE_COUNT_KEY = "EC";
    public static final String END_KEY = "END";
    public static final String GENOTYPE_FILTER_KEY = "FT";
    public static final String GENOTYPE_KEY = "GT";
    @Deprecated
    public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods
    public static final String GENOTYPE_POSTERIORS_KEY = "GP";
    public static final String GENOTYPE_QUALITY_KEY = "GQ";
    public static final String HAPMAP2_KEY = "H2";
    public static final String HAPMAP3_KEY = "H3";
    public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
    public static final String RMS_MAPPING_QUALITY_KEY = "MQ";
    public static final String MAPPING_QUALITY_ZERO_KEY = "MQ0";
    public static final String SAMPLE_NUMBER_KEY = "NS";
    public static final String PHRED_GENOTYPE_LIKELIHOODS_KEY = "PL"; // phred-scaled genotype likelihoods
    public static final String PHASE_QUALITY_KEY = "PQ";
    public static final String PHASE_SET_KEY = "PS";
    public static final String OLD_DEPTH_KEY = "RD";
    public static final String STRAND_BIAS_KEY = "SB";
    public static final String SOMATIC_KEY = "SOMATIC";
    public static final String VALIDATED_KEY = "VALIDATED";
    public static final String THOUSAND_GENOMES_KEY = "1000G";

    // separators
    public static final String FORMAT_FIELD_SEPARATOR = ":";
    public static final String GENOTYPE_FIELD_SEPARATOR = ":";
    public static final char GENOTYPE_FIELD_SEPARATOR_CHAR = ':';
    public static final String FIELD_SEPARATOR = "\t";
    public static final char FIELD_SEPARATOR_CHAR = '\t';
    public static final String FILTER_CODE_SEPARATOR = ";";
    public static final String INFO_FIELD_ARRAY_SEPARATOR = ",";
    public static final char INFO_FIELD_ARRAY_SEPARATOR_CHAR = ',';
    public static final String ID_FIELD_SEPARATOR = ";";
    public static final String INFO_FIELD_SEPARATOR = ";";
    public static final char INFO_FIELD_SEPARATOR_CHAR = ';';
    public static final String UNPHASED = "/";
    public static final String PHASED = "|";
    public static final String PHASED_SWITCH_PROB_v3 = "\\";
    public static final String PHASING_TOKENS = "/|\\";

    // old indel alleles
    public static final char DELETION_ALLELE_v3 = 'D';
    public static final char INSERTION_ALLELE_v3 = 'I';

    // missing/default values
    public static final String UNFILTERED = ".";
    public static final String PASSES_FILTERS_v3 = "0";
    public static final String PASSES_FILTERS_v4 = "PASS";
    public static final String EMPTY_ID_FIELD = ".";
    public static final String EMPTY_INFO_FIELD = ".";
    public static final String EMPTY_ALTERNATE_ALLELE_FIELD = ".";
    public static final String MISSING_VALUE_v4 = ".";
    public static final String MISSING_QUALITY_v3 = "-1";
    public static final Double MISSING_QUALITY_v3_DOUBLE = Double.valueOf(MISSING_QUALITY_v3);
    public static final String MISSING_GENOTYPE_QUALITY_v3 = "-1";
    public static final String MISSING_HAPLOTYPE_QUALITY_v3 = "-1";
    public static final String MISSING_DEPTH_v3 = "-1";
    public static final String UNBOUNDED_ENCODING_v4 = ".";
    public static final String UNBOUNDED_ENCODING_v3 = "-1";
    public static final String PER_ALLELE_COUNT = "A";
    public static final String PER_GENOTYPE_COUNT = "G";
    public static final String EMPTY_ALLELE = ".";
    public static final String EMPTY_GENOTYPE = "./.";
    public static final double MAX_GENOTYPE_QUAL = 99.0;

    public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
    public static final String DOUBLE_PRECISION_INT_SUFFIX = ".00";
    public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare

    // utility class with only static constants — prevent instantiation
    private VCFConstants() {}
}

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
/**
 * @author ebanks
 * A class representing a key=value entry for FILTER fields in the VCF header
 */
class VCFFilterHeaderLine extends VCFSimpleHeaderLine {

    /**
     * create a VCF filter header line
     *
     * @param name        the name for this header line
     * @param description the description for this header line
     */
    public VCFFilterHeaderLine(String name, String description) {
        super(name, description, SupportedHeaderLineType.FILTER);
    }

    /**
     * create a VCF filter header line from a raw header-line string
     * (the original javadoc said "info header line" — copy-paste remnant)
     *
     * @param line    the header line
     * @param version the vcf header version
     */
    protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, SupportedHeaderLineType.FILTER);
    }
}

View File

@ -0,0 +1,32 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
/**
 * @author ebanks
 * <p/>
 * Class VCFFormatHeaderLine
 * <p/>
 * A class representing a key=value entry for genotype FORMAT fields in the VCF header
 */
class VCFFormatHeaderLine extends VCFCompoundHeaderLine {

    /**
     * Create a FORMAT line with a fixed integer value count.
     *
     * @param name        the field key (e.g. "GQ")
     * @param count       the fixed number of values
     * @param type        the value type; Flag is rejected (meaningless for genotype fields)
     * @param description the human-readable description
     */
    public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
        super(name, count, type, description, SupportedHeaderLineType.FORMAT);
        if (type == VCFHeaderLineType.Flag)
            throw new IllegalArgumentException("Flag is an unsupported type for format fields");
    }

    /**
     * Create a FORMAT line with a symbolic count (A, G, or unbounded).
     *
     * @param name        the field key
     * @param count       the count type
     * @param type        the value type; Flag is rejected, matching the int-count constructor
     * @param description the human-readable description
     */
    public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) {
        super(name, count, type, description, SupportedHeaderLineType.FORMAT);
        // consistency fix: the original performed this check only in the int-count
        // constructor, silently accepting an invalid Flag FORMAT line here
        if (type == VCFHeaderLineType.Flag)
            throw new IllegalArgumentException("Flag is an unsupported type for format fields");
    }

    /**
     * Parse a FORMAT line from its raw header-line string.
     *
     * @param line    the raw header line
     * @param version the VCF header version
     */
    protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, SupportedHeaderLineType.FORMAT);
    }

    // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype).
    @Override
    boolean allowFlagValues() {
        return false;
    }
}

View File

@ -0,0 +1,198 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.broad.tribble.util.ParsingUtils;
import java.util.*;
/**
 * @author aaron
 * <p/>
 * Class VCFHeader
 * <p/>
 * A class representing the VCF header
 */
class VCFHeader {

    // the mandatory header fields
    public enum HEADER_FIELDS {
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
    }

    // the associated meta data
    private final Set<VCFHeaderLine> mMetaData;
    private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
    private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
    private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();

    // the list of auxillary tags (sample names, in column order)
    private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();

    // the character string that indicates meta data
    public static final String METADATA_INDICATOR = "##";

    // the header string indicator
    public static final String HEADER_INDICATOR = "#";

    // were the input samples sorted originally (or are we sorting them)?
    private boolean samplesWereAlreadySorted = true;

    /**
     * create a VCF header, given a list of meta data and auxillary tags
     *
     * @param metaData the meta data associated with this header
     */
    public VCFHeader(Set<VCFHeaderLine> metaData) {
        mMetaData = new TreeSet<VCFHeaderLine>(metaData);
        loadVCFVersion();
        loadMetaDataMaps();
    }

    /**
     * create a VCF header, given a list of meta data and auxillary tags
     *
     * @param metaData            the meta data associated with this header (may be null)
     * @param genotypeSampleNames the sample names
     */
    public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
        mMetaData = new TreeSet<VCFHeaderLine>();
        if ( metaData != null )
            mMetaData.addAll(metaData);
        mGenotypeSampleNames.addAll(genotypeSampleNames);
        loadVCFVersion();
        loadMetaDataMaps();
        samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
    }

    /**
     * Adds a header line to the header metadata.
     *
     * @param headerLine Line to add to the existing metadata component.
     */
    public void addMetaDataLine(VCFHeaderLine headerLine) {
        mMetaData.add(headerLine);
    }

    /**
     * Strip any fileformat/version lines out of the metadata; getMetaData() re-inserts
     * a canonical VCF4.0 version line on output.
     * (The original javadoc claimed this throws on a missing/out-of-date version;
     * the code has never done that — it only removes the lines.)
     */
    public void loadVCFVersion() {
        List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
        for ( VCFHeaderLine line : mMetaData )
            if ( VCFHeaderVersion.isFormatString(line.getKey())) {
                toRemove.add(line);
            }
        // remove old header lines for now,
        mMetaData.removeAll(toRemove);
    }

    /**
     * load the format/info meta data maps (these are used for quick lookup by key name)
     */
    private void loadMetaDataMaps() {
        for ( VCFHeaderLine line : mMetaData ) {
            if ( line instanceof VCFInfoHeaderLine ) {
                VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
                mInfoMetaData.put(infoLine.getName(), infoLine);
            }
            else if ( line instanceof VCFFormatHeaderLine ) {
                VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
                mFormatMetaData.put(formatLine.getName(), formatLine);
            }
            else {
                mOtherMetaData.put(line.getKey(), line);
            }
        }
    }

    /**
     * get the header fields in order they're presented in the input file (which is now required to be
     * the order presented in the spec).
     *
     * @return a set of the header fields, in order
     */
    public Set<HEADER_FIELDS> getHeaderFields() {
        // EnumSet.allOf iterates in declaration order, which is exactly the spec's
        // column order — replaces the original hand-rolled LinkedHashSet population loop
        return EnumSet.allOf(HEADER_FIELDS.class);
    }

    /**
     * get the meta data, associated with this header
     *
     * @return an unmodifiable set of the meta data, with a canonical VCF4.0 version line prepended
     */
    public Set<VCFHeaderLine> getMetaData() {
        Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
        lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
        lines.addAll(mMetaData);
        return Collections.unmodifiableSet(lines);
    }

    /**
     * get the genotyping sample names
     *
     * NOTE(review): this returns the internal mutable set; callers must not modify it.
     * Left as-is because existing callers may rely on identity/order — confirm before wrapping.
     *
     * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
     */
    public Set<String> getGenotypeSamples() {
        return mGenotypeSampleNames;
    }

    /**
     * do we have genotyping data?
     *
     * @return true if we have genotyping columns, false otherwise
     */
    public boolean hasGenotypingData() {
        return mGenotypeSampleNames.size() > 0;
    }

    /**
     * were the input samples sorted originally?
     *
     * @return true if the input samples were sorted originally, false otherwise
     */
    public boolean samplesWereAlreadySorted() {
        return samplesWereAlreadySorted;
    }

    /** @return the column count (mandatory fields, plus FORMAT and one column per sample when genotypes are present) */
    public int getColumnCount() {
        return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
    }

    /**
     * @param key the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFInfoHeaderLine getInfoHeaderLine(String key) {
        return mInfoMetaData.get(key);
    }

    /**
     * @param key the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFFormatHeaderLine getFormatHeaderLine(String key) {
        return mFormatMetaData.get(key);
    }

    /**
     * @param key the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFHeaderLine getOtherHeaderLine(String key) {
        return mOtherMetaData.get(key);
    }
}

View File

@ -0,0 +1,134 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.broad.tribble.TribbleException;
import java.util.Map;
/**
 * @author ebanks
 * <p/>
 * Class VCFHeaderLine
 * <p/>
 * A class representing a key=value entry in the VCF header
 */
class VCFHeaderLine implements Comparable {
    protected static boolean ALLOW_UNBOUND_DESCRIPTIONS = true;
    protected static String UNBOUND_DESCRIPTION = "Not provided in original VCF header";

    // key is required (enforced by the constructor); value may be null
    private String mKey = null;
    private String mValue = null;

    /**
     * create a VCF header line
     *
     * @param key   the key for this header line; must not be null
     * @param value the value for this header line; may be null
     */
    public VCFHeaderLine(String key, String value) {
        if ( key == null )
            throw new IllegalArgumentException("VCFHeaderLine: key cannot be null: key = " + key);
        mKey = key;
        mValue = value;
    }

    /**
     * Get the key
     *
     * @return the key
     */
    public String getKey() {
        return mKey;
    }

    /**
     * Get the value
     *
     * @return the value, possibly null
     */
    public String getValue() {
        return mValue;
    }

    public String toString() {
        return toStringEncoding();
    }

    /**
     * Should be overloaded in sub classes to do subclass specific encoding
     *
     * @return the string encoding
     */
    protected String toStringEncoding() {
        return mKey + "=" + mValue;
    }

    public boolean equals(Object o) {
        if ( this == o )
            return true;
        if ( !(o instanceof VCFHeaderLine) )
            return false;
        final VCFHeaderLine that = (VCFHeaderLine) o;
        // null-safe on the value: the constructor permits a null value, and the previous
        // comparison (mValue.equals(...)) threw an NPE in that case
        return mKey.equals(that.getKey()) &&
               (mValue == null ? that.getValue() == null : mValue.equals(that.getValue()));
    }

    /**
     * hashCode consistent with equals(). Previously missing entirely, which broke the
     * Object contract and made these lines unreliable keys/members of hash collections.
     */
    public int hashCode() {
        return 31 * mKey.hashCode() + (mValue == null ? 0 : mValue.hashCode());
    }

    public int compareTo(Object other) {
        // ordering is by the encoded string form, which is what TreeSet users rely on
        return toString().compareTo(other.toString());
    }

    /**
     * @param line the line
     * @return true if the line is a VCF meta data line (starts with the header indicator), false otherwise
     */
    public static boolean isHeaderLine(String line) {
        return line != null && line.length() > 0 && VCFHeader.HEADER_INDICATOR.equals(line.substring(0, 1));
    }

    /**
     * create a string of a mapping pair for the target VCF version
     * @param keyValues a mapping of the key->value pairs to output
     * @return a string, correctly formatted
     */
    public static String toStringEncoding(Map<String, ? extends Object> keyValues) {
        StringBuilder builder = new StringBuilder();
        builder.append("<");
        boolean start = true;
        for (Map.Entry<String, ?> entry : keyValues.entrySet()) {
            if (start) start = false;
            else builder.append(",");

            if ( entry.getValue() == null ) throw new TribbleException.InternalCodecException("Header problem: unbound value at " + entry + " from " + keyValues);

            builder.append(entry.getKey());
            builder.append("=");
            // quote values containing ',' or ' ', and always quote Description values
            builder.append(entry.getValue().toString().contains(",") ||
                           entry.getValue().toString().contains(" ") ||
                           entry.getKey().equals("Description") ? "\"" + entry.getValue() + "\"" : entry.getValue());
        }
        builder.append(">");
        return builder.toString();
    }
}

View File

@ -0,0 +1,8 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
/**
 * The count encodings used for the "Number" entry of VCF header lines.
 */
public enum VCFHeaderLineCount {
    INTEGER,    // an explicit, fixed integer count
    A,          // one value per alternate allele
    G,          // one value per genotype
    UNBOUNDED   // unknown/unbounded number of values ('.')
}

View File

@ -0,0 +1,124 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import java.util.*;
/**
 * Translates VCF header metadata lines between header versions by delegating to
 * the line parser registered for each supported version.
 */
public class VCFHeaderLineTranslator {
    // version -> parser dispatch table; the parsers are stateless, so the two
    // VCF3 versions and the two VCF4 versions each share a single instance
    private static Map<VCFHeaderVersion, VCFLineParser> mapping;

    static {
        mapping = new HashMap<VCFHeaderVersion, VCFLineParser>();
        final VCFLineParser vcf4Parser = new VCF4Parser();
        final VCFLineParser vcf3Parser = new VCF3Parser();
        mapping.put(VCFHeaderVersion.VCF4_0, vcf4Parser);
        mapping.put(VCFHeaderVersion.VCF4_1, vcf4Parser);
        mapping.put(VCFHeaderVersion.VCF3_3, vcf3Parser);
        mapping.put(VCFHeaderVersion.VCF3_2, vcf3Parser);
    }

    /**
     * Parse a header value line with the parser appropriate for the given version.
     *
     * @param version          the VCF header version of the line
     * @param valueLine        the text to parse
     * @param expectedTagOrder the tags expected, in order
     * @return the tag -> value mapping parsed from the line
     */
    public static Map<String, String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder) {
        return mapping.get(version).parseLine(valueLine, expectedTagOrder);
    }
}
/** A parser for the value portion of a structured VCF header line. */
interface VCFLineParser {
    /**
     * @param valueLine        the text after the "##KEY=" prefix
     * @param expectedTagOrder the tags expected, in order
     * @return the tag -> value mapping parsed from the line
     */
    public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder);
}

/**
 * a class that handles the to and from disk for VCF 4 lines
 */
class VCF4Parser implements VCFLineParser {
    // NOTE(review): never read or written anywhere in this class -- candidate for removal
    Set<String> bracketed = new HashSet<String>();

    /**
     * parse a VCF4 line of the form <Tag1=value1,Tag2=value2,...>; values containing
     * ',' may be double-quoted (the quotes are stripped).
     *
     * @param valueLine        the line
     * @param expectedTagOrder the tags expected, in order
     * @return a mapping of the tags parsed out
     * @throws IllegalArgumentException if more tags than expected are found, or a tag is out of order
     */
    public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
        // our return map; LinkedHashMap preserves the on-disk tag order
        Map<String, String> ret = new LinkedHashMap<String, String>();

        // a builder to store up characters as we go
        StringBuilder builder = new StringBuilder();

        // store the key when we're parsing out the values
        String key = "";

        // where are we in the stream of characters?
        int index = 0;

        // are we inside a quotation? we don't special case ',' then
        boolean inQuote = false;

        // a little switch machine to parse out the tags; regex ended up being really complicated and ugly
        for (char c : valueLine.toCharArray()) {
            if ( c == '\"' ) {
                inQuote = !inQuote;             // quotes are consumed, never stored
            } else if ( inQuote ) {
                builder.append(c);              // inside quotes everything is literal
            } else {
                switch (c) {
                    // an open bracket at the beginning is ignored; otherwise this
                    // DELIBERATELY falls through to the '>' handling below
                    case ('<') : if (index == 0) break;
                    // a close bracket at the very end commits the final key=value pair
                    case ('>') : if (index == valueLine.length() - 1) ret.put(key, builder.toString().trim()); break;
                    // at an equals, copy the key and reset the builder
                    case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break;
                    // at a comma, drop the current key value to the return map
                    case (',') : ret.put(key, builder.toString().trim()); builder = new StringBuilder(); break;
                    default: builder.append(c); // otherwise simply append to the current string
                }
            }
            index++;
        }

        // validate the tags against the expected list
        index = 0;
        // BUG FIX: the message previously printed expectedTagOrder.size() where the
        // offending string was intended ("... in string 4"); include both pieces of context
        if (ret.size() > expectedTagOrder.size())
            throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " (expected at most " + expectedTagOrder.size() + ") in string " + valueLine);
        for (String str : ret.keySet()) {
            if (!expectedTagOrder.get(index).equals(str))
                throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
            index++;
        }
        return ret;
    }
}

/**
 * a class that handles parsing of VCF 3 header lines: a comma-separated list of
 * positional values, associated with the expected tags by position.
 */
class VCF3Parser implements VCFLineParser {

    /**
     * @param valueLine        the line
     * @param expectedTagOrder the tags expected, in order (also supplies the keys)
     * @return a mapping of the tags parsed out
     * @throws IllegalArgumentException if the value count does not match the expected tag count
     */
    public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
        // our return map
        Map<String, String> ret = new LinkedHashMap<String, String>();

        // a builder to store up characters as we go
        StringBuilder builder = new StringBuilder();

        // where are we in the stream of characters?
        int index = 0;

        // where in the expected tag order are we?
        int tagIndex = 0;

        // are we inside a quotation? we don't special case ',' then
        boolean inQuote = false;

        // a little switch machine to parse out the tags; regex ended up being really complicated and ugly
        for (char c : valueLine.toCharArray()) {
            switch (c) {
                // a quote means we ignore ',' in our strings; quotes themselves are not stored
                case ('\"') : inQuote = !inQuote; break;
                // an unquoted comma commits the current value under the next expected tag;
                // a quoted comma deliberately falls through to default and is appended
                case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++), builder.toString()); builder = new StringBuilder(); break; }
                default: builder.append(c); // otherwise simply append to the current string
            }
            index++;
        }
        // commit the trailing value (there is no comma after the last field)
        ret.put(expectedTagOrder.get(tagIndex++), builder.toString());

        // validate the tags against the expected list
        index = 0;
        if (tagIndex != expectedTagOrder.size())
            throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size());
        for (String str : ret.keySet()) {
            if (!expectedTagOrder.get(index).equals(str))
                throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
            index++;
        }
        return ret;
    }
}

View File

@ -0,0 +1,8 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
/**
 * The value-type encodings used for fields in VCF header lines.
 */
enum VCFHeaderLineType {
    Integer,   // whole-number values
    Float,     // floating-point values
    String,    // free-form text values
    Character, // single-character values
    Flag       // presence/absence only; carries no value
}

View File

@ -0,0 +1,91 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.broad.tribble.TribbleException;
/**
 * Identifying information (version string and format-key string) for each
 * supported VCF header version.
 */
enum VCFHeaderVersion {
    VCF3_2("VCRv3.2", "format"),
    VCF3_3("VCFv3.3", "fileformat"),
    VCF4_0("VCFv4.0", "fileformat"),
    VCF4_1("VCFv4.1", "fileformat");

    private final String versionString;
    private final String formatString;

    /**
     * create the enum, privately, using:
     * @param version the version string
     * @param format  the format-key string
     */
    VCFHeaderVersion(String version, String format) {
        this.versionString = version;
        this.formatString = format;
    }

    /**
     * get the header version
     * @param version the version string
     * @return the matching VCFHeaderVersion, or null if none matches
     */
    public static VCFHeaderVersion toHeaderVersion(String version) {
        final String cleaned = clean(version);
        for ( final VCFHeaderVersion candidate : values() ) {
            if ( candidate.versionString.equals(cleaned) )
                return candidate;
        }
        return null;
    }

    /**
     * are we a valid version string of some type
     * @param version the version string
     * @return true if we're valid of some type, false otherwise
     */
    public static boolean isVersionString(String version) {
        return toHeaderVersion(version) != null;
    }

    /**
     * are we a valid format string for some type
     * @param format the format string
     * @return true if we're valid of some type, false otherwise
     */
    public static boolean isFormatString(String format) {
        final String cleaned = clean(format);
        for ( final VCFHeaderVersion candidate : values() ) {
            if ( candidate.formatString.equals(cleaned) )
                return true;
        }
        return false;
    }

    /**
     * Parse a "##fileformat=VCFvX.Y"-style line into a header version.
     * @param versionLine the full version line (including the leading "##")
     * @return the matching version
     * @throws TribbleException.InvalidHeader if the line or version is unrecognized
     */
    public static VCFHeaderVersion getHeaderVersion(String versionLine) {
        final String[] lineFields = versionLine.split("=");
        // substring(2) drops the leading "##" before checking the format key
        if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) )
            throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line");

        if ( !isVersionString(lineFields[1]) )
            throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");

        return toHeaderVersion(lineFields[1]);
    }

    /**
     * Utility function to clean up a VCF header string
     *
     * @param s string
     * @return trimmed version of s
     */
    private static String clean(String s) {
        return s.trim();
    }

    public String getVersionString() {
        return versionString;
    }

    public String getFormatString() {
        return formatString;
    }
}

View File

@ -0,0 +1,29 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
/**
 * @author ebanks
 * <p/>
 * Class VCFInfoHeaderLine
 * <p/>
 * A class representing a key=value entry for INFO fields in the VCF header
 */
class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
    /** Create an INFO header line with a fixed integer value count. */
    public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
        super(name, count, type, description, SupportedHeaderLineType.INFO);
    }

    /** Create an INFO header line with a symbolic count (A, G, UNBOUNDED, ...). */
    public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) {
        super(name, count, type, description, SupportedHeaderLineType.INFO);
    }

    /** Create an INFO header line by parsing its on-disk text representation. */
    protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, SupportedHeaderLineType.INFO);
    }

    // info fields allow flag values (unlike FORMAT fields)
    @Override
    boolean allowFlagValues() {
        return true;
    }
}

View File

@ -0,0 +1,30 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.v13;
/** An interface for VCF header lines that carry an ID/name (e.g. INFO, FORMAT, FILTER lines). */
interface VCFNamedHeaderLine {
    /** @return the name (ID) of this header line */
    String getName();
}

View File

@ -0,0 +1,22 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import java.util.List;
import java.util.Map;
/**
 * All VCF codecs need to implement this interface so that we can perform lazy loading.
 */
interface VCFParser {
    /**
     * create a genotype map
     * @param str     the string to parse into genotypes (presumably the genotype-field
     *                portion of a VCF record — TODO confirm against the codec caller)
     * @param alleles the list of alleles
     * @param chr     chrom
     * @param pos     position
     * @return a mapping of sample name to genotype object
     */
    public Map<String, Genotype> createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
}

View File

@ -0,0 +1,81 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * @author ebanks
 * A class representing a key=value entry for simple VCF header types (FILTER, ALT)
 */
abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {

    /** The simple header-line flavors this class can represent. */
    public enum SupportedHeaderLineType {
        FILTER, ALT;
    }

    private String name;
    private String description;

    // our type of line, i.e. filter, alt, etc
    private final SupportedHeaderLineType lineType;

    /**
     * create a VCF filter header line
     *
     * @param name        the name for this header line; must not be null
     * @param description the description for this header line; must not be null
     * @param lineType    the header line type
     */
    public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) {
        super(lineType.toString(), "");
        this.lineType = lineType;
        this.name = name;
        this.description = description;
        if ( name == null || description == null )
            throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description ));
    }

    /**
     * create a VCF header line by parsing its on-disk text representation
     *
     * @param line     the header line
     * @param version  the vcf header version
     * @param lineType the header line type
     */
    protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
        super(lineType.toString(), "");
        this.lineType = lineType;
        Map<String, String> mapping = VCFHeaderLineTranslator.parseLine(version, line, Arrays.asList("ID", "Description"));
        name = mapping.get("ID");
        description = mapping.get("Description");
        if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided
            description = UNBOUND_DESCRIPTION;
    }

    protected String toStringEncoding() {
        // emit e.g. FILTER=<ID=...,Description=...>
        Map<String, Object> map = new LinkedHashMap<String, Object>();
        map.put("ID", name);
        map.put("Description", description);
        return lineType.toString() + "=" + toStringEncoding(map);
    }

    public boolean equals(Object o) {
        if ( !(o instanceof VCFSimpleHeaderLine) )
            return false;
        VCFSimpleHeaderLine other = (VCFSimpleHeaderLine) o;
        // NOTE(review): lineType is not compared, so a FILTER and an ALT line with the
        // same name/description compare equal -- confirm this is intended
        return name.equals(other.name) &&
               description.equals(other.description);
    }

    /**
     * hashCode consistent with equals(). Previously missing, which broke the
     * equals/hashCode contract for hash-based collections of header lines.
     */
    public int hashCode() {
        return 31 * name.hashCode() + description.hashCode();
    }

    public String getName() {
        return name;
    }

    public String getDescription() {
        return description;
    }
}

View File

@ -0,0 +1,227 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import java.util.*;
/**
 * A set of static utility methods for common operations on VCF files/records.
 */
class VCFUtils {
    /**
     * Constructor access disallowed...static utility methods only!
     */
    private VCFUtils() { }

    /**
     * Collect the VCF headers for the given rod bindings.
     *
     * @param toolkit     GATK engine
     * @param rodBindings the bindings whose headers we want
     * @return map from rod name to its VCFHeader
     */
    public static <T extends Feature> Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List<RodBinding<T>> rodBindings) {
        // Collect the eval rod names
        final Set<String> names = new TreeSet<String>();
        for ( final RodBinding<T> evalRod : rodBindings )
            names.add(evalRod.getName());

        return getVCFHeadersFromRods(toolkit, names);
    }

    /** @return VCF headers for all rods known to the engine (no name filtering). */
    public static Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) {
        return getVCFHeadersFromRods(toolkit, (Collection<String>)null);
    }

    /**
     * @param toolkit  GATK engine
     * @param rodNames names of rods to include, or null to include all of them
     * @return map from rod name to its VCFHeader, for every selected rod whose header is a VCFHeader
     */
    public static Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
        Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();

        // iterate to get all of the sample names
        List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
        for ( ReferenceOrderedDataSource source : dataSources ) {
            // ignore the rod if it's not in our list
            if ( rodNames != null && !rodNames.contains(source.getName()) )
                continue;

            if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader )
                data.put(source.getName(), (VCFHeader)source.getHeader());
        }

        return data;
    }

    /**
     * @param toolkit GATK engine
     * @param prefix  rod-name prefix to select on
     * @return map from rod name to its VCFHeader, for every rod whose name starts with prefix
     */
    public static Map<String, VCFHeader> getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit, String prefix) {
        Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();

        // iterate to get all of the sample names
        List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
        for ( ReferenceOrderedDataSource source : dataSources ) {
            // ignore the rod if lacks the prefix
            if ( ! source.getName().startsWith(prefix) )
                continue;

            if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader )
                data.put(source.getName(), (VCFHeader)source.getHeader());
        }

        return data;
    }

    /**
     * Gets the header fields from all VCF rods input by the user
     *
     * @param toolkit GATK engine
     *
     * @return a set of all fields
     */
    public static Set<VCFHeaderLine> getHeaderFields(GenomeAnalysisEngine toolkit) {
        return getHeaderFields(toolkit, null);
    }

    /**
     * Gets the header fields from all VCF rods input by the user
     *
     * @param toolkit  GATK engine
     * @param rodNames names of rods to use, or null if we should use all possible ones
     *
     * @return a set of all fields
     */
    public static Set<VCFHeaderLine> getHeaderFields(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
        // keep a sorted, de-duplicated collection of header lines
        TreeSet<VCFHeaderLine> fields = new TreeSet<VCFHeaderLine>();

        // iterate to get all of the sample names
        List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
        for ( ReferenceOrderedDataSource source : dataSources ) {
            // ignore the rod if it's not in our list
            if ( rodNames != null && !rodNames.contains(source.getName()) )
                continue;

            if ( source.getRecordType().equals(VariantContext.class) ) {
                VCFHeader header = (VCFHeader)source.getHeader();
                if ( header != null )
                    fields.addAll(header.getMetaData());
            }
        }

        return fields;
    }

    /** Only displays a warning if a logger is provided and an identical warning hasn't been already issued */
    private static final class HeaderConflictWarner {
        Logger logger;
        Set<String> alreadyIssued = new HashSet<String>();

        private HeaderConflictWarner(final Logger logger) {
            this.logger = logger;
        }

        public void warn(final VCFHeaderLine line, final String msg) {
            // at most one warning per header key for the lifetime of this warner
            if ( logger != null && ! alreadyIssued.contains(line.getKey()) ) {
                alreadyIssued.add(line.getKey());
                logger.warn(msg);
            }
        }
    }

    /**
     * Merge the metadata lines of several VCF headers, reconciling compatible conflicts
     * (Number differences, Integer/Float type promotion) and throwing on incompatible ones.
     *
     * @param headers the headers to merge
     * @param logger  optional logger for conflict warnings; may be null
     * @return the merged set of header lines
     * @throws IllegalStateException if two headers carry incompatible lines for the same key
     */
    public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers, Logger logger) throws IllegalStateException {
        HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
        HeaderConflictWarner conflictWarner = new HeaderConflictWarner(logger);

        // todo -- needs to remove all version headers from sources and add its own VCF version line
        for ( VCFHeader source : headers ) {
            for ( VCFHeaderLine line : source.getMetaData() ) {
                String key = line.getKey();
                // NOTE(review): key and name are concatenated with no separator -- confirm
                // that cross-key collisions (e.g. "AB"+"C" vs "A"+"BC") are impossible here
                if ( line instanceof VCFNamedHeaderLine )
                    key = key + "" + ((VCFNamedHeaderLine) line).getName();

                if ( map.containsKey(key) ) {
                    VCFHeaderLine other = map.get(key);
                    if ( line.equals(other) )
                        continue;
                    else if ( ! line.getClass().equals(other.getClass()) )
                        throw new IllegalStateException("Incompatible header types: " + line + " " + other );
                    else if ( line instanceof VCFFilterHeaderLine ) {
                        String lineName = ((VCFFilterHeaderLine) line).getName();
                        String otherName = ((VCFFilterHeaderLine) other).getName();
                        if ( ! lineName.equals(otherName) )
                            throw new IllegalStateException("Incompatible header types: " + line + " " + other );
                    } else if ( line instanceof VCFCompoundHeaderLine ) {
                        VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
                        VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;

                        // if the names are the same, but the values are different, we need to quit
                        if ( ! (compLine).equalsExcludingDescription(compOther) ) {
                            if ( compLine.getType().equals(compOther.getType()) ) {
                                // The Number entry is an Integer that describes the number of values that can be
                                // included with the INFO field. For example, if the INFO field contains a single
                                // number, then this value should be 1. However, if the INFO field describes a pair
                                // of numbers, then this value should be 2 and so on. If the number of possible
                                // values varies, is unknown, or is unbounded, then this value should be '.'.
                                conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other);
                                compOther.setNumberToUnbounded();
                            } else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) {
                                // promote key to Float
                                conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
                                map.put(key, compOther);
                            } else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) {
                                // promote key to Float
                                // NOTE(review): unlike the branch above, the map entry is NOT updated here,
                                // so the Integer-typed line stays in the map -- confirm this is intended
                                conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
                            } else {
                                throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
                            }
                        }
                        // BUG FIX: previously compared the description String against the whole
                        // header-line object (compOther), which is always unequal, so this
                        // warning fired even for identical descriptions
                        if ( ! compLine.getDescription().equals(compOther.getDescription()) )
                            conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine);
                    } else {
                        // we are not equal, but we're not anything special either
                        conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other);
                    }
                } else {
                    map.put(key, line);
                }
            }
        }

        return new HashSet<VCFHeaderLine>(map.values());
    }

    /**
     * @param VCs  the contexts to scan; may be null
     * @param type the variant type to look for
     * @return the ID of the first context of the given type, or null if none (or VCs is null)
     */
    public static String rsIDOfFirstRealVariant(List<VariantContext> VCs, VariantContext.Type type) {
        if ( VCs == null )
            return null;

        String rsID = null;
        for ( VariantContext vc : VCs ) {
            if ( vc.getType() == type ) {
                rsID = vc.getID();
                break;
            }
        }

        return rsID;
    }
}

View File

@ -0,0 +1,16 @@
package org.broadinstitute.sting.utils.variantcontext.v13;
/**
 * this class writes VCF files
 */
public interface VCFWriter {

    /** Write the header to the output (presumably required before any add() calls — TODO confirm ordering contract). */
    public void writeHeader(VCFHeader header);

    /**
     * attempt to close the VCF file
     */
    public void close();

    /** Append a variant record to the output. */
    public void add(VariantContext vc);
}

View File

@ -0,0 +1,315 @@
/*
* Copyright (c) 2010. The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.v13;
import org.apache.commons.jexl2.JexlContext;
import org.apache.commons.jexl2.MapContext;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/**
 *
 * @author aaron
 * @author depristo
 *
 * Class VariantJEXLContext
 *
 * implements the JEXL context for VariantContext; this saves us from
 * having to generate a JEXL context lookup map every time we want to evaluate an expression.
 *
 * This is package protected, only classes in variantcontext should have access to it.
 *
 * // todo -- clean up to remove or better support genotype filtering
 */
class VariantJEXLContext implements JexlContext {
    // our stored variant context
    private VariantContext vc;

    /** Strategy for pulling a single named value out of a VariantContext. */
    private interface AttributeGetter {
        public Object get(VariantContext vc);
    }

    // fixed dispatch table: JEXL variable name -> accessor on the VariantContext
    private static Map<String, AttributeGetter> x = new HashMap<String, AttributeGetter>();

    static {
        x.put("vc",          new AttributeGetter() { public Object get(VariantContext vc) { return vc; }});
        x.put("CHROM",       new AttributeGetter() { public Object get(VariantContext vc) { return vc.getChr(); }});
        x.put("POS",         new AttributeGetter() { public Object get(VariantContext vc) { return vc.getStart(); }});
        x.put("TYPE",        new AttributeGetter() { public Object get(VariantContext vc) { return vc.getType().toString(); }});
        x.put("QUAL",        new AttributeGetter() { public Object get(VariantContext vc) { return 10 * vc.getNegLog10PError(); }});
        x.put("ALLELES",     new AttributeGetter() { public Object get(VariantContext vc) { return vc.getAlleles(); }});
        x.put("N_ALLELES",   new AttributeGetter() { public Object get(VariantContext vc) { return vc.getNAlleles(); }});
        x.put("FILTER",      new AttributeGetter() { public Object get(VariantContext vc) { return vc.isFiltered() ? "1" : "0"; }});

//        x.put("GT",        new AttributeGetter() { public Object get(VariantContext vc) { return g.getGenotypeString(); }});
        x.put("homRefCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomRefCount(); }});
        x.put("hetCount",    new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHetCount(); }});
        x.put("homVarCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomVarCount(); }});
    }

    public VariantJEXLContext(VariantContext vc) {
        this.vc = vc;
    }

    public Object get(String name) {
        Object result = null;
        if ( x.containsKey(name) ) { // dynamic resolution of name -> value via map
            result = x.get(name).get(vc);
        } else if ( vc.hasAttribute(name) ) {
            // fall back to the INFO-field attributes
            result = vc.getAttribute(name);
        } else if ( vc.getFilters().contains(name) ) {
            // an applied filter name resolves to "1" (true)
            result = "1";
        }

        //System.out.printf("dynamic lookup %s => %s%n", name, result);

        return result;
    }

    public boolean has(String name) {
        return get(name) != null;
    }

    /**
     * This context is read-only; mutation is not supported.
     */
    public void set(String name, Object value) {
        // BUG FIX: the message previously said "remove() not supported" -- this is set()
        throw new UnsupportedOperationException("set() not supported on a VariantJEXLContext");
    }
}
/**
* this is an implementation of a Map of JexlVCMatchExp to true or false values. It lazy initializes each value
* as requested to save as much processing time as possible.
*
* Compatible with JEXL 1.1 (this code will be easier if we move to 2.0, all of the functionality can go into the
* JexlContext's get()
*
*/
class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
// our variant context and/or Genotype
private final VariantContext vc;
private final Genotype g;
// our context
private JexlContext jContext = null;
// our mapping from JEXLVCMatchExp to Booleans, which will be set to NULL for previously uncached JexlVCMatchExp
private Map<VariantContextUtils.JexlVCMatchExp,Boolean> jexl;
/**
 * Build a lazy expression map evaluated against vc and genotype g.
 * No expression is evaluated here; initialize() only creates empty cache slots.
 *
 * @param jexlCollection the expressions to track
 * @param vc the variant context (createContext() tolerates null only when g is provided)
 * @param g the genotype, or null for variant-level evaluation
 */
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, VariantContext vc, Genotype g) {
    this.vc = vc;
    this.g = g;
    initialize(jexlCollection);
}
/** Variant-level (no genotype) convenience constructor. */
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, VariantContext vc) {
    this(jexlCollection, vc, null);
}
/** Create one lazily-filled cache slot per expression; null marks "not yet evaluated". */
private void initialize(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection) {
    jexl = new HashMap<VariantContextUtils.JexlVCMatchExp, Boolean>();
    for (final VariantContextUtils.JexlVCMatchExp expression : jexlCollection) {
        jexl.put(expression, null);
    }
}
/**
 * create the internal JexlContext, only when required. This code is where new JEXL context variables
 * should get added.
 */
private void createContext() {
    if ( g == null ) {
        // variant-level evaluation: delegate to the VariantContext-backed context
        // todo -- remove dependancy on g to the entire system
        jContext = new VariantJEXLContext(vc);
    } else {
        //
        // this whole branch is here just to support G jexl operations
        //
        Map<String, Object> infoMap = new HashMap<String, Object>();

        // NOTE(review): when vc is null, the genotype-specific fields below are also
        // skipped because they sit inside this vc != null guard -- confirm intended
        if ( vc != null ) {
            // create a mapping of what we know about the variant context, its Chromosome, positions, etc.
            infoMap.put("CHROM", vc.getChr());
            infoMap.put("POS", vc.getStart());
            infoMap.put("TYPE", vc.getType().toString());
            infoMap.put("QUAL", String.valueOf(vc.getPhredScaledQual()));

            // add alleles
            infoMap.put("ALLELES", Utils.join(";", vc.getAlleles()));
            infoMap.put("N_ALLELES", String.valueOf(vc.getNAlleles()));

            // add attributes (all stringified)
            addAttributesToMap(infoMap, vc.getAttributes());

            // add filter fields: each applied filter name maps to "1"
            infoMap.put("FILTER", vc.isFiltered() ? "1" : "0");
            for ( Object filterCode : vc.getFilters() ) {
                infoMap.put(String.valueOf(filterCode), "1");
            }

            // add genotype-specific fields
            // TODO -- implement me when we figure out a good way to represent this
            //    for ( Genotype g : vc.getGenotypes().values() ) {
            //        String prefix = g.getSampleName() + ".";
            //        addAttributesToMap(infoMap, g.getAttributes(), prefix);
            //        infoMap.put(prefix + "GT", g.getGenotypeString());
            //    }

            // add specific genotype if one is provided
            infoMap.put(VCFConstants.GENOTYPE_KEY, g.getGenotypeString());
            infoMap.put("isHomRef", g.isHomRef() ? "1" : "0");
            infoMap.put("isHet", g.isHet() ? "1" : "0");
            infoMap.put("isHomVar", g.isHomVar() ? "1" : "0");
            infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, new Double(g.getPhredScaledQual()));
            // copy genotype attributes, skipping missing ('.') values
            for ( Map.Entry<String, Object> e : g.getAttributes().entrySet() ) {
                if ( e.getValue() != null && !e.getValue().equals(VCFConstants.MISSING_VALUE_v4) )
                    infoMap.put(e.getKey(), e.getValue());
            }
        }

        // create the internal context that we can evaluate expressions against
        jContext = new MapContext(infoMap);
    }
}
/**
 * @return the number of tracked expressions (whether evaluated yet or not)
 */
public int size() {
    return jexl.size();
}
/**
 * @return true if no expressions are being tracked
 */
public boolean isEmpty() { return this.jexl.isEmpty(); }
/**
 * do we contain the specified key
 * @param o the key (a JexlVCMatchExp)
 * @return true if we have a slot for that key (evaluated or not); never triggers evaluation
 */
public boolean containsKey(Object o) { return jexl.containsKey(o); }
/**
 * Return the (lazily computed) result for this expression: cached values are served
 * directly, otherwise the expression is evaluated now and the result cached.
 */
public Boolean get(Object o) {
    // serve from the cache when present and already evaluated
    final Boolean cached = jexl.get(o);
    if (cached != null) {
        return cached;
    }
    // not yet evaluated (or unknown key): cast and evaluate on demand
    final VariantContextUtils.JexlVCMatchExp expression = (VariantContextUtils.JexlVCMatchExp) o;
    evaluateExpression(expression);
    return jexl.get(expression);
}
/**
 * get the keyset of map
 * @return a set of keys of type JexlVCMatchExp; no evaluation is triggered
 */
public Set<VariantContextUtils.JexlVCMatchExp> keySet() {
    return jexl.keySet();
}
/**
 * get all the values of the map. This is an expensive call, since it evaluates all keys that haven't
 * been evaluated yet. This is fine if you truly want all the keys, but if you only want a portion, or know
 * the keys you want, you would be better off using get() to get them by name.
 * @return a collection of boolean values, representing the results of all the variants evaluated
 */
public Collection<Boolean> values() {
    // force evaluation of every not-yet-evaluated (null) slot before exposing the values
    for (VariantContextUtils.JexlVCMatchExp exp : jexl.keySet())
        if (jexl.get(exp) == null)
            evaluateExpression(exp);
    return jexl.values();
}
/**
 * Evaluates a single JexlVCMatchExp against the current context (building the
 * context first if it does not exist yet) and caches the Boolean result.
 *
 * @param exp the JexlVCMatchExp to evaluate
 */
private void evaluateExpression(VariantContextUtils.JexlVCMatchExp exp) {
    // the JEXL context is built lazily; make sure it exists before evaluating
    if (jContext == null) {
        createContext();
    }
    try {
        final Boolean result = (Boolean) exp.exp.evaluate(jContext);
        jexl.put(exp, result);
    } catch (Exception e) {
        // NOTE(review): the original cause `e` is dropped here; consider chaining it
        // if UserException.CommandLineException has a (String, Throwable) constructor
        throw new UserException.CommandLineException(String.format("Invalid JEXL expression detected for %s with message %s", exp.name, e.getMessage()));
    }
}
/**
 * Helper function: copies every attribute into the info map we're building,
 * stringifying each value.
 *
 * @param infoMap    the map being populated
 * @param attributes the attributes to add
 */
private static void addAttributesToMap(Map<String, Object> infoMap, Map<String, ?> attributes) {
    for (final String key : attributes.keySet()) {
        infoMap.put(key, String.valueOf(attributes.get(key)));
    }
}
/**
 * Stores a precomputed result for the given expression.
 *
 * @param jexlVCMatchExp the expression key
 * @param aBoolean       the result to associate with it
 * @return the previous value for that key, or null if there was none
 */
public Boolean put(VariantContextUtils.JexlVCMatchExp jexlVCMatchExp, Boolean aBoolean) {
    return this.jexl.put(jexlVCMatchExp, aBoolean);
}
/**
 * Copies every entry from the given map into this one.
 *
 * @param map the entries to add
 */
public void putAll(Map<? extends VariantContextUtils.JexlVCMatchExp, ? extends Boolean> map) {
    this.jexl.putAll(map);
}
// //////////////////////////////////////////////////////////////////////////////////////
// The Following are unsupported at the moment
// //////////////////////////////////////////////////////////////////////////////////////
// Deliberately unsupported: with only Boolean values, answering containsValue()
// would force evaluation of every expression in the map for very little benefit.
public boolean containsValue(Object o) {
    throw new UnsupportedOperationException("containsValue() not supported on a JEXLMap");
}
// Deliberately unsupported: removing cached results is not meaningful here.
public Boolean remove(Object o) {
    throw new UnsupportedOperationException("remove() not supported on a JEXLMap");
}
/**
 * Deliberately unsupported: an entry set would force evaluation of every expression.
 *
 * @return never returns
 * @throws UnsupportedOperationException always
 */
public Set<Entry<VariantContextUtils.JexlVCMatchExp, Boolean>> entrySet() {
    // fixed copy-paste bug: the message previously (and wrongly) said "clear()"
    throw new UnsupportedOperationException("entrySet() not supported on a JEXLMap");
}
// Deliberately unsupported: clearing cached results is not meaningful here.
public void clear() {
    throw new UnsupportedOperationException("clear() not supported on a JEXLMap");
}
}