diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java index e16176dff..eda74a965 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java @@ -244,124 +244,134 @@ public class VariantContextBenchmark extends SimpleBenchmark { } } - public void timeV13(int rep) { - for ( int i = 0; i < rep; i++ ) { - FunctionToBenchmark func = getV13FunctionToBenchmark(); - FeatureCodec codec = new org.broadinstitute.sting.utils.variantcontext.v13.VCFCodec(); - runBenchmark(codec, func); - } - } + // -------------------------------------------------------------------------------- + // + // V13 + // + // In order to use this, you must move the v13 version from archive and uncomment + // + // git mv private/archive/java/src/org/broadinstitute/sting/utils/variantcontext/v13 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13 + // + // -------------------------------------------------------------------------------- - public FunctionToBenchmark getV13FunctionToBenchmark() { - switch ( operation ) { - case READ: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - ; // empty operation - } - }; - case SUBSET_TO_SAMPLES: - return new FunctionToBenchmark() { - List samples; - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - if ( samples == null ) - samples = new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake); - org.broadinstitute.sting.utils.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values()); - sub.getNSamples(); - } - }; - - case GET_TYPE: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - vc.getType(); - } - }; - case GET_ID: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - vc.getID(); - } - }; - case GET_GENOTYPES: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - vc.getGenotypes().size(); - } - }; - - case GET_GENOTYPES_FOR_SAMPLES: - return new FunctionToBenchmark() { - Set samples; - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - if ( samples == null ) - samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); - vc.getGenotypes(samples).size(); - } - }; - - case GET_ATTRIBUTE_STRING: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - vc.getAttribute("AN", null); - } - }; - - case GET_ATTRIBUTE_INT: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - vc.getAttributeAsInt("AC", 0); - } - }; - - case GET_N_SAMPLES: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - vc.getNSamples(); - } - }; - - case GET_GENOTYPES_IN_ORDER_OF_NAME: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - ; // TODO - TEST IS BROKEN - //vc.getGenotypesSortedByName(); - } - }; - - case CALC_GENOTYPE_COUNTS: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - vc.getHetCount(); - } - }; - - case MERGE: - return new FunctionToBenchmark() { - public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { - List toMerge = new ArrayList(); - - for ( int i = 0; i < dupsToMerge; i++ ) { - Map gc = new HashMap(); - for ( final org.broadinstitute.sting.utils.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) { - String name = g.getSampleName()+"_"+i; - gc.put(name, new org.broadinstitute.sting.utils.variantcontext.v13.Genotype(name, - g.getAlleles(), g.getNegLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector())); - toMerge.add(org.broadinstitute.sting.utils.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc)); - } - } - - org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser, - toMerge, null, - org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, - true, false, "set", false, true); - } - }; - - default: throw new IllegalArgumentException("Unexpected operation " + operation); - } - } +// public void timeV13(int rep) { +// for ( int i = 0; i < rep; i++ ) { +// FunctionToBenchmark func = getV13FunctionToBenchmark(); +// FeatureCodec codec = new org.broadinstitute.sting.utils.variantcontext.v13.VCFCodec(); +// runBenchmark(codec, func); +// } +// } +// +// public FunctionToBenchmark getV13FunctionToBenchmark() { +// switch ( operation ) { +// case READ: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// ; // empty operation +// } +// }; +// case SUBSET_TO_SAMPLES: +// return new FunctionToBenchmark() { +// List samples; +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// if ( samples == null ) +// samples = new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake); +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values()); +// sub.getNSamples(); +// } +// }; +// +// case GET_TYPE: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getType(); +// } +// }; +// case GET_ID: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getID(); +// } +// }; +// case GET_GENOTYPES: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getGenotypes().size(); +// } +// }; +// +// case GET_GENOTYPES_FOR_SAMPLES: +// return new FunctionToBenchmark() { +// Set samples; +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// if ( samples == null ) +// samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); +// vc.getGenotypes(samples).size(); +// } +// }; +// +// case GET_ATTRIBUTE_STRING: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getAttribute("AN", null); +// } +// }; +// +// case GET_ATTRIBUTE_INT: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getAttributeAsInt("AC", 0); +// } +// }; +// +// case GET_N_SAMPLES: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getNSamples(); +// } +// }; +// +// case GET_GENOTYPES_IN_ORDER_OF_NAME: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// ; // TODO - TEST IS BROKEN +// //vc.getGenotypesSortedByName(); +// } +// }; +// +// case CALC_GENOTYPE_COUNTS: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getHetCount(); +// } +// }; +// +// case MERGE: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// List toMerge = new ArrayList(); +// +// for ( int i = 0; i < dupsToMerge; i++ ) { +// Map gc = new HashMap(); +// for ( final org.broadinstitute.sting.utils.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) { +// String name = g.getSampleName()+"_"+i; +// gc.put(name, new org.broadinstitute.sting.utils.variantcontext.v13.Genotype(name, +// g.getAlleles(), g.getNegLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector())); +// toMerge.add(org.broadinstitute.sting.utils.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc)); +// } +// } +// +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser, +// toMerge, null, +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, +// true, false, "set", false, true); +// } +// }; +// +// default: throw new IllegalArgumentException("Unexpected operation " + operation); +// } +// } public static void main(String[] args) { CaliperMain.main(VariantContextBenchmark.class, args); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/AbstractVCFCodec.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/AbstractVCFCodec.java deleted file mode 100755 index 5310313e0..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/AbstractVCFCodec.java +++ /dev/null @@ -1,635 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.NameAwareCodec; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.BlockCompressedInputStream; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.*; -import java.util.*; -import java.util.zip.GZIPInputStream; - - -abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser { - - protected final static Logger log = Logger.getLogger(VCFCodec.class); - protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column - - protected VCFHeaderVersion version; - - // we have to store the list of strings that make up the header until they're needed - protected VCFHeader header = null; - - // a mapping of the allele - protected Map> alleleMap = new HashMap>(3); - - // for ParsingUtils.split - protected String[] GTValueArray = new String[100]; - protected String[] genotypeKeyArray = new String[100]; - protected String[] infoFieldArray = new String[1000]; - protected String[] infoValueArray = new String[1000]; - - // for performance testing purposes - public static boolean validate = true; - - // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over - // todo: make this thread safe? - protected String[] parts = null; - protected String[] genotypeParts = null; - - // for performance we cache the hashmap of filter encodings for quick lookup - protected HashMap> filterHash = new HashMap>(); - - // a mapping of the VCF fields to their type, filter fields, and format fields, for quick lookup to validate against - TreeMap infoFields = new TreeMap(); - TreeMap formatFields = new TreeMap(); - Set filterFields = new HashSet(); - - // we store a name to give to each of the variant contexts we emit - protected String name = "Unknown"; - - protected int lineNo = 0; - - protected Map stringCache = new HashMap(); - - - /** - * @param reader the line reader to take header lines from - * @return the number of header lines - */ - public abstract Object readHeader(LineReader reader); - - /** - * create a genotype map - * @param str the string - * @param alleles the list of alleles - * @param chr chrom - * @param pos position - * @return a mapping of sample name to genotype object - */ - public abstract Map createGenotypeMap(String str, List alleles, String chr, int pos); - - - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * @param filterString the string to parse - * @return a set of the filters applied - */ - protected abstract Set parseFilters(String filterString); - - /** - * create a VCF header - * @param headerStrings a list of strings that represent all the ## entries - * @param line the single # line (column names) - * @return the count of header lines - */ - protected Object createHeader(List headerStrings, String line) { - - headerStrings.add(line); - - Set metaData = new TreeSet(); - Set auxTags = new LinkedHashSet(); - // iterate over all the passed in strings - for ( String str : headerStrings ) { - if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { - String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); - if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) - throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); - - int arrayIndex = 0; - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { - try { - if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) - throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'"); - } catch (IllegalArgumentException e) { - throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name."); - } - arrayIndex++; - } - - boolean sawFormatTag = false; - if ( arrayIndex < strings.length ) { - if ( !strings[arrayIndex].equals("FORMAT") ) - throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); - sawFormatTag = true; - arrayIndex++; - } - - while ( arrayIndex < strings.length ) - auxTags.add(strings[arrayIndex++]); - - if ( sawFormatTag && auxTags.size() == 0 ) - throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data"); - - } else { - if ( str.startsWith("##INFO=") ) { - VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version); - metaData.add(info); - infoFields.put(info.getName(), info.getType()); - } else if ( str.startsWith("##FILTER=") ) { - VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9),version); - metaData.add(filter); - filterFields.add(filter.getName()); - } else if ( str.startsWith("##FORMAT=") ) { - VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9),version); - metaData.add(format); - formatFields.put(format.getName(), format.getType()); - } else { - int equals = str.indexOf("="); - if ( equals != -1 ) - metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); - } - } - } - - header = new VCFHeader(metaData, auxTags); - return header; - } - - /** - * the fast decode function - * @param line the line of text for the record - * @return a feature, (not guaranteed complete) that has the correct start and stop - */ - public Feature decodeLoc(String line) { - lineNo++; - - // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line - if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; - - // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); - - final String[] locParts = new String[6]; - int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - - if ( nParts != 6 ) - throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); - - // get our alleles (because the end position depends on them) - final String ref = getCachedString(locParts[3].toUpperCase()); - final String alts = getCachedString(locParts[4].toUpperCase()); - final List alleles = parseAlleles(ref, alts, lineNo); - - // find out our location - final int start = Integer.valueOf(locParts[1]); - int stop = start; - - // ref alleles don't need to be single bases for monomorphic sites - if ( alleles.size() == 1 ) { - stop = start + alleles.get(0).length() - 1; - } else if ( !isSingleNucleotideEvent(alleles) ) { - stop = clipAlleles(start, ref, alleles, null, lineNo); - } - - return new VCFLocFeature(locParts[0], start, stop); - } - - private final static class VCFLocFeature implements Feature { - - final String chr; - final int start, stop; - - private VCFLocFeature(String chr, int start, int stop) { - this.chr = chr; - this.start = start; - this.stop = stop; - } - - public String getChr() { return chr; } - public int getStart() { return start; } - public int getEnd() { return stop; } - } - - - /** - * decode the line into a feature (VariantContext) - * @param line the line - * @return a VariantContext - */ - public Feature decode(String line) { - // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line - if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; - - // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); - - if (parts == null) - parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)]; - - int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - - // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data) - if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) || - (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) - throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + - " tokens, and saw " + nParts + " )", lineNo); - - return parseVCFLine(parts); - } - - protected void generateException(String message) { - throw new UserException.MalformedVCF(message, lineNo); - } - - protected static void generateException(String message, int lineNo) { - throw new UserException.MalformedVCF(message, lineNo); - } - - /** - * parse out the VCF line - * - * @param parts the parts split up - * @return a variant context object - */ - private VariantContext parseVCFLine(String[] parts) { - // increment the line count - lineNo++; - - // parse out the required fields - String contig = getCachedString(parts[0]); - int pos = Integer.valueOf(parts[1]); - String id = null; - if ( parts[2].length() == 0 ) - generateException("The VCF specification requires a valid ID field"); - else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) - id = VCFConstants.EMPTY_ID_FIELD; - else - id = new String(parts[2]); - String ref = getCachedString(parts[3].toUpperCase()); - String alts = getCachedString(parts[4].toUpperCase()); - Double qual = parseQual(parts[5]); - String filter = getCachedString(parts[6]); - String info = new String(parts[7]); - - // get our alleles, filters, and setup an attribute map - List alleles = parseAlleles(ref, alts, lineNo); - Set filters = parseFilters(filter); - Map attributes = parseInfo(info, id); - - // find out our current location, and clip the alleles down to their minimum length - int loc = pos; - // ref alleles don't need to be single bases for monomorphic sites - if ( alleles.size() == 1 ) { - loc = pos + alleles.get(0).length() - 1; - } else if ( !isSingleNucleotideEvent(alleles) ) { - ArrayList newAlleles = new ArrayList(); - loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); - alleles = newAlleles; - } - - // do we have genotyping data - if (parts.length > NUM_STANDARD_FIELDS) { - attributes.put(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, new String(parts[8])); - attributes.put(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY, this); - } - - VariantContext vc = null; - try { - vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]); - } catch (Exception e) { - generateException(e.getMessage()); - } - - // did we resort the sample names? If so, we need to load the genotype data - if ( !header.samplesWereAlreadySorted() ) - vc.getGenotypes(); - - return vc; - } - - /** - * - * @return the type of record - */ - public Class getFeatureType() { - return VariantContext.class; - } - - /** - * get the name of this codec - * @return our set name - */ - public String getName() { - return name; - } - - /** - * set the name of this codec - * @param name new name - */ - public void setName(String name) { - this.name = name; - } - - /** - * Return a cached copy of the supplied string. - * - * @param str string - * @return interned string - */ - protected String getCachedString(String str) { - String internedString = stringCache.get(str); - if ( internedString == null ) { - internedString = new String(str); - stringCache.put(internedString, internedString); - } - return internedString; - } - - /** - * parse out the info fields - * @param infoField the fields - * @param id the indentifier - * @return a mapping of keys to objects - */ - private Map parseInfo(String infoField, String id) { - Map attributes = new HashMap(); - - if ( infoField.length() == 0 ) - generateException("The VCF specification requires a valid info field"); - - if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) { - if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 ) - generateException("The VCF specification does not allow for whitespace in the INFO field"); - - int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false); - for (int i = 0; i < infoFieldSplitSize; i++) { - String key; - Object value; - - int eqI = infoFieldArray[i].indexOf("="); - if ( eqI != -1 ) { - key = infoFieldArray[i].substring(0, eqI); - String str = infoFieldArray[i].substring(eqI+1); - - // split on the INFO field separator - int infoValueSplitSize = ParsingUtils.split(str, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); - if ( infoValueSplitSize == 1 ) { - value = infoValueArray[0]; - } else { - ArrayList valueList = new ArrayList(infoValueSplitSize); - for ( int j = 0; j < infoValueSplitSize; j++ ) - valueList.add(infoValueArray[j]); - value = valueList; - } - } else { - key = infoFieldArray[i]; - value = true; - } - - attributes.put(key, value); - } - } - - if ( ! id.equals(VCFConstants.EMPTY_ID_FIELD) ) - attributes.put(VariantContext.ID_KEY, id); - return attributes; - } - - /** - * create a an allele from an index and an array of alleles - * @param index the index - * @param alleles the alleles - * @return an Allele - */ - protected static Allele oneAllele(String index, List alleles) { - if ( index.equals(VCFConstants.EMPTY_ALLELE) ) - return Allele.NO_CALL; - int i = Integer.valueOf(index); - if ( i >= alleles.size() ) - throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record"); - return alleles.get(i); - } - - - /** - * parse genotype alleles from the genotype string - * @param GT GT string - * @param alleles list of possible alleles - * @param cache cache of alleles for GT - * @return the allele list for the GT string - */ - protected static List parseGenotypeAlleles(String GT, List alleles, Map> cache) { - // cache results [since they are immutable] and return a single object for each genotype - List GTAlleles = cache.get(GT); - - if ( GTAlleles == null ) { - StringTokenizer st = new StringTokenizer(GT, VCFConstants.PHASING_TOKENS); - GTAlleles = new ArrayList(st.countTokens()); - while ( st.hasMoreTokens() ) { - String genotype = st.nextToken(); - GTAlleles.add(oneAllele(genotype, alleles)); - } - cache.put(GT, GTAlleles); - } - - return GTAlleles; - } - - /** - * parse out the qual value - * @param qualString the quality string - * @return return a double - */ - protected static Double parseQual(String qualString) { - // if we're the VCF 4 missing char, return immediately - if ( qualString.equals(VCFConstants.MISSING_VALUE_v4)) - return VariantContext.NO_NEG_LOG_10PERROR; - - Double val = Double.valueOf(qualString); - - // check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles - if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON)) - return VariantContext.NO_NEG_LOG_10PERROR; - - // scale and return the value - return val / 10.0; - } - - /** - * parse out the alleles - * @param ref the reference base - * @param alts a string of alternates to break into alleles - * @param lineNo the line number for this record - * @return a list of alleles, and a pair of the shortest and longest sequence - */ - protected static List parseAlleles(String ref, String alts, int lineNo) { - List alleles = new ArrayList(2); // we are almost always biallelic - // ref - checkAllele(ref, true, lineNo); - Allele refAllele = Allele.create(ref, true); - alleles.add(refAllele); - - if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split - parseSingleAltAllele(alleles, alts, lineNo); - else - for ( String alt : alts.split(",") ) - parseSingleAltAllele(alleles, alt, lineNo); - - return alleles; - } - - /** - * check to make sure the allele is an acceptable allele - * @param allele the allele to check - * @param isRef are we the reference allele? - * @param lineNo the line number for this record - */ - private static void checkAllele(String allele, boolean isRef, int lineNo) { - if ( allele == null || allele.length() == 0 ) - generateException("Empty alleles are not permitted in VCF records", lineNo); - - if ( isSymbolicAllele(allele) ) { - if ( isRef ) { - generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); - } - } else { - // check for VCF3 insertions or deletions - if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) ) - generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" + - " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo); - - if (!Allele.acceptableAlleleBases(allele)) - generateException("Unparsable vcf record with allele " + allele, lineNo); - - if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) ) - generateException("The reference allele cannot be missing", lineNo); - } - } - - /** - * return true if this is a symbolic allele (e.g. ) otherwise false - * @param allele the allele to check - * @return true if the allele is a symbolic allele, otherwise false - */ - private static boolean isSymbolicAllele(String allele) { - return (allele != null && allele.startsWith("<") && allele.endsWith(">") && allele.length() > 2); - } - - /** - * parse a single allele, given the allele list - * @param alleles the alleles available - * @param alt the allele to parse - * @param lineNo the line number for this record - */ - private static void parseSingleAltAllele(List alleles, String alt, int lineNo) { - checkAllele(alt, false, lineNo); - - Allele allele = Allele.create(alt, false); - if ( ! allele.isNoCall() ) - alleles.add(allele); - } - - protected static boolean isSingleNucleotideEvent(List alleles) { - for ( Allele a : alleles ) { - if ( a.length() != 1 ) - return false; - } - return true; - } - - public static int computeForwardClipping(List unclippedAlleles, String ref) { - boolean clipping = true; - - for ( Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - continue; - - if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) { - clipping = false; - break; - } - } - - return (clipping) ? 1 : 0; - } - - protected static int computeReverseClipping(List unclippedAlleles, String ref, int forwardClipping, int lineNo) { - int clipping = 0; - boolean stillClipping = true; - - while ( stillClipping ) { - for ( Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - continue; - - if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) - stillClipping = false; - else if ( ref.length() == clipping ) - generateException("bad alleles encountered", lineNo); - else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] ) - stillClipping = false; - } - if ( stillClipping ) - clipping++; - } - - return clipping; - } - /** - * clip the alleles, based on the reference - * - * @param position the unadjusted start position (pre-clipping) - * @param ref the reference string - * @param unclippedAlleles the list of unclipped alleles - * @param clippedAlleles output list of clipped alleles - * @param lineNo the current line number in the file - * @return the new reference end position of this event - */ - protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - - int forwardClipping = computeForwardClipping(unclippedAlleles, ref); - int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); - - if ( clippedAlleles != null ) { - for ( Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) { - clippedAlleles.add(a); - } else { - clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference())); - } - } - } - - // the new reference length - int refLength = ref.length() - reverseClipping; - - return position+Math.max(refLength - 1,0); - } - - public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) { - try { - return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || - isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) || - isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE); - } catch ( FileNotFoundException e ) { - return false; - } catch ( IOException e ) { - return false; - } - } - - private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) { - try { - byte[] buff = new byte[MAGIC_HEADER_LINE.length()]; - int nread = stream.read(buff, 0, MAGIC_HEADER_LINE.length()); - boolean eq = Arrays.equals(buff, MAGIC_HEADER_LINE.getBytes()); - return eq; -// String firstLine = new String(buff); -// return firstLine.startsWith(MAGIC_HEADER_LINE); - } catch ( IOException e ) { - return false; - } catch ( RuntimeException e ) { - return false; - } finally { - try { stream.close(); } catch ( IOException e ) {} - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Allele.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Allele.java deleted file mode 100755 index f43cd7b98..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Allele.java +++ /dev/null @@ -1,456 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; - -/** - * Immutable representation of an allele - * - * Types of alleles: - * - * Ref: a t C g a // C is the reference base - * - * : a t G g a // C base is a G in some individuals - * - * : a t - g a // C base is deleted w.r.t. the reference - * - * : a t CAg a // A base is inserted w.r.t. the reference sequence - * - * In these cases, where are the alleles? - * - * SNP polymorphism of C/G -> { C , G } -> C is the reference allele - * 1 base deletion of C -> { C , - } -> C is the reference allele - * 1 base insertion of A -> { - ; A } -> Null is the reference allele - * - * Suppose I see a the following in the population: - * - * Ref: a t C g a // C is the reference base - * : a t G g a // C base is a G in some individuals - * : a t - g a // C base is deleted w.r.t. the reference - * - * How do I represent this? There are three segregating alleles: - * - * { C , G , - } - * - * Now suppose I have this more complex example: - * - * Ref: a t C g a // C is the reference base - * : a t - g a - * : a t - - a - * : a t CAg a - * - * There are actually four segregating alleles: - * - * { C g , - g, - -, and CAg } over bases 2-4 - * - * However, the molecular equivalence explicitly listed above is usually discarded, so the actual - * segregating alleles are: - * - * { C g, g, -, C a g } - * - * Critically, it should be possible to apply an allele to a reference sequence to create the - * correct haplotype sequence: - * - * Allele + reference => haplotype - * - * For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the - * Allele object itself. So there's an idea of an A/C polymorphism independent of it's surrounding context. - * - * Given list of alleles it's possible to determine the "type" of the variation - * - * A / C @ loc => SNP with - * - / A => INDEL - * - * If you know where allele is the reference, you can determine whether the variant is an insertion or deletion. - * - * Alelle also supports is concept of a NO_CALL allele. This Allele represents a haplotype that couldn't be - * determined. This is usually represented by a '.' allele. - * - * Note that Alleles store all bases as bytes, in **UPPER CASE**. So 'atc' == 'ATC' from the perspective of an - * Allele. - - * @author ebanks, depristo - */ -class Allele implements Comparable { - private static final byte[] EMPTY_ALLELE_BASES = new byte[0]; - - private boolean isRef = false; - private boolean isNull = false; - private boolean isNoCall = false; - private boolean isSymbolic = false; - - private byte[] bases = null; - - public final static String NULL_ALLELE_STRING = "-"; - public final static String NO_CALL_STRING = "."; - /** A generic static NO_CALL allele for use */ - - // no public way to create an allele - private Allele(byte[] bases, boolean isRef) { - // standardize our representation of null allele and bases - if ( wouldBeNullAllele(bases) ) { - bases = EMPTY_ALLELE_BASES; - isNull = true; - } else if ( wouldBeNoCallAllele(bases) ) { - bases = EMPTY_ALLELE_BASES; - isNoCall = true; - if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele"); - } else if ( wouldBeSymbolicAllele(bases) ) { - isSymbolic = true; - if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele"); - } -// else -// bases = new String(bases).toUpperCase().getBytes(); // todo -- slow performance - - this.isRef = isRef; - this.bases = bases; - - if ( ! acceptableAlleleBases(bases) ) - throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); - } - - private Allele(String bases, boolean isRef) { - this(bases.getBytes(), isRef); - } - - - private final static Allele REF_A = new Allele("A", true); - private final static Allele ALT_A = new Allele("A", false); - private final static Allele REF_C = new Allele("C", true); - private final static Allele ALT_C = new Allele("C", false); - private final static Allele REF_G = new Allele("G", true); - private final static Allele ALT_G = new Allele("G", false); - private final static Allele REF_T = new Allele("T", true); - private final static Allele ALT_T = new Allele("T", false); - private final static Allele REF_N = new Allele("N", true); - private final static Allele ALT_N = new Allele("N", false); - private final static Allele REF_NULL = new Allele(NULL_ALLELE_STRING, true); - private final static Allele ALT_NULL = new Allele(NULL_ALLELE_STRING, false); - public final static Allele NO_CALL = new Allele(NO_CALL_STRING, false); - - // --------------------------------------------------------------------------------------------------------- - // - // creation routines - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases - * == '-', a Null allele is created. If bases == '.', a no call Allele is created. - * - * @param bases the DNA sequence of this variation, '-', of '.' - * @param isRef should we make this a reference allele? - * @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformated - */ - public static Allele create(byte[] bases, boolean isRef) { - if ( bases == null ) - throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele"); - - if ( bases.length == 1 ) { - // optimization to return a static constant Allele for each single base object - switch (bases[0]) { - case '.': - if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele"); - return NO_CALL; - case '-': return isRef ? REF_NULL : ALT_NULL; - case 'A': case 'a' : return isRef ? REF_A : ALT_A; - case 'C': case 'c' : return isRef ? REF_C : ALT_C; - case 'G': case 'g' : return isRef ? REF_G : ALT_G; - case 'T': case 't' : return isRef ? REF_T : ALT_T; - case 'N': case 'n' : return isRef ? REF_N : ALT_N; - default: throw new IllegalArgumentException("Illegal base: " + (char)bases[0]); - } - } else { - return new Allele(bases, isRef); - } - } - - public static Allele create(byte base, boolean isRef) { -// public Allele(byte base, boolean isRef) { - return create( new byte[]{ base }, isRef); - } - - public static Allele create(byte base) { - return create( base, false ); - } - - public static Allele extend(Allele left, byte[] right) { - if (left.isSymbolic()) - throw new IllegalArgumentException("Cannot extend a symbolic allele"); - byte[] bases = null; - if ( left.length() == 0 ) - bases = right; - else { - bases = new byte[left.length() + right.length]; - System.arraycopy(left.getBases(), 0, bases, 0, left.length()); - System.arraycopy(right, 0, bases, left.length(), right.length); - } - - return create(bases, left.isReference()); - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent the null allele - */ - public static boolean wouldBeNullAllele(byte[] bases) { - return (bases.length == 1 && bases[0] == '-') || bases.length == 0; - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent the NO_CALL allele - */ - public static boolean wouldBeNoCallAllele(byte[] bases) { - return bases.length == 1 && bases[0] == '.'; - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent a symbolic allele - */ - public static boolean wouldBeSymbolicAllele(byte[] bases) { - return bases.length > 2 && bases[0] == '<' && bases[bases.length-1] == '>'; - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent the well formatted allele - */ - public static boolean acceptableAlleleBases(String bases) { - return acceptableAlleleBases(bases.getBytes()); - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent the well formatted allele - */ - public static boolean acceptableAlleleBases(byte[] bases) { - if ( wouldBeNullAllele(bases) || wouldBeNoCallAllele(bases) || wouldBeSymbolicAllele(bases) ) - return true; - - for ( int i = 0; i < bases.length; i++ ) { - switch (bases[i]) { - case 'A': case 'C': case 'G': case 'T': case 'N' : case 'a': case 'c': case 'g': case 't': case 'n' : - break; - default: - return false; - } - } - - return true; - } - - /** - * @see Allele(byte[], boolean) - * - * @param bases bases representing an allele - * @param isRef is this the reference allele? - */ - public static Allele create(String bases, boolean isRef) { - //public Allele(String bases, boolean isRef) { - return create(bases.getBytes(), isRef); - } - - - /** - * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information - * - * @param bases bases representing an allele - */ - public static Allele create(String bases) { - return create(bases, false); - } - - /** - * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information - * - * @param bases bases representing an allele - */ - public static Allele create(byte[] bases) { - return create(bases, false); - //this(bases, false); - } - - // --------------------------------------------------------------------------------------------------------- - // - // accessor routines - // - // --------------------------------------------------------------------------------------------------------- - - //Returns true if this is the null allele - public boolean isNull() { return isNull; } - // Returns true if this is not the null allele - public boolean isNonNull() { return ! isNull(); } - - // Returns true if this is the NO_CALL allele - public boolean isNoCall() { return isNoCall; } - // Returns true if this is not the NO_CALL allele - public boolean isCalled() { return ! isNoCall(); } - - // Returns true if this Allele is the reference allele - public boolean isReference() { return isRef; } - // Returns true if this Allele is not the reference allele - public boolean isNonReference() { return ! isReference(); } - - // Returns true if this Allele is symbolic (i.e. no well-defined base sequence) - public boolean isSymbolic() { return isSymbolic; } - - // Returns a nice string representation of this object - public String toString() { - return (isNull() ? NULL_ALLELE_STRING : ( isNoCall() ? NO_CALL_STRING : getDisplayString() )) + (isReference() ? "*" : ""); - } - - /** - * Return the DNA bases segregating in this allele. Note this isn't reference polarized, - * so the Null allele is represented by a vector of length 0 - * - * @return the segregating bases - */ - public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; } - - /** - * Return the DNA bases segregating in this allele in String format. - * This is useful, because toString() adds a '*' to reference alleles and getBases() returns garbage when you call toString() on it. - * - * @return the segregating bases - */ - public String getBaseString() { return new String(getBases()); } - - /** - * Return the printed representation of this allele. - * Same as getBaseString(), except for symbolic alleles. - * For symbolic alleles, the base string is empty while the display string contains . - * - * @return the allele string representation - */ - public String getDisplayString() { return new String(bases); } - - /** - * @param other the other allele - * - * @return true if these alleles are equal - */ - public boolean equals(Object other) { - return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) ); - } - - /** - * @return hash code - */ - public int hashCode() { - int hash = 1; - for (int i = 0; i < bases.length; i++) - hash += (i+1) * bases[i]; - return hash; - } - - /** - * Returns true if this and other are equal. If ignoreRefState is true, then doesn't require both alleles has the - * same ref tag - * - * @param other allele to compare to - * @param ignoreRefState if true, ignore ref state in comparison - * @return true if this and other are equal - */ - public boolean equals(Allele other, boolean ignoreRefState) { - return this == other || (isRef == other.isRef || ignoreRefState) && isNull == other.isNull && isNoCall == other.isNoCall && (bases == other.bases || Arrays.equals(bases, other.bases)); - } - - /** - * @param test bases to test against - * - * @return true if this Alelle contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles - */ - public boolean basesMatch(byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); } - - /** - * @param test bases to test against - * - * @return true if this Alelle contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles - */ - public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); } - - /** - * @param test allele to test against - * - * @return true if this Alelle contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles - */ - public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); } - - /** - * @return the length of this allele. Null and NO_CALL alleles have 0 length. - */ - public int length() { - return isSymbolic ? 0 : bases.length; - } - - // --------------------------------------------------------------------------------------------------------- - // - // useful static functions - // - // --------------------------------------------------------------------------------------------------------- - - public static Allele getMatchingAllele(Collection allAlleles, String alleleBases) { - return getMatchingAllele(allAlleles, alleleBases.getBytes()); - } - - public static Allele getMatchingAllele(Collection allAlleles, byte[] alleleBases) { - for ( Allele a : allAlleles ) { - if ( a.basesMatch(alleleBases) ) { - return a; - } - } - - if ( wouldBeNoCallAllele(alleleBases) ) - return NO_CALL; - else - return null; // couldn't find anything - } - - public static List resolveAlleles(List possibleAlleles, List alleleStrings) { - List myAlleles = new ArrayList(alleleStrings.size()); - - for ( String alleleString : alleleStrings ) { - Allele allele = getMatchingAllele(possibleAlleles, alleleString); - - if ( allele == null ) { - if ( Allele.wouldBeNoCallAllele(alleleString.getBytes()) ) { - allele = create(alleleString); - } else { - throw new IllegalArgumentException("Allele " + alleleString + " not present in the list of alleles " + possibleAlleles); - } - } - - myAlleles.add(allele); - } - - return myAlleles; - } - - public int compareTo(Allele other) { - if ( isReference() && other.isNonReference() ) - return -1; - else if ( isNonReference() && other.isReference() ) - return 1; - else - return getBaseString().compareTo(other.getBaseString()); // todo -- potential performance issue - } - - public static boolean oneIsPrefixOfOther(Allele a1, Allele a2) { - if ( a1.isNull() || a2.isNull() ) - return true; - - if ( a2.length() >= a1.length() ) - return firstIsPrefixOfSecond(a1, a2); - else - return firstIsPrefixOfSecond(a2, a1); - } - - private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) { - String a1String = a1.getBaseString(); - return a2.getBaseString().substring(0, a1String.length()).equals(a1String); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Genotype.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Genotype.java deleted file mode 100755 index 91aa3b1da..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Genotype.java +++ /dev/null @@ -1,349 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - - -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * This class encompasses all the basic information about a genotype. It is immutable. - * - * @author Mark DePristo - */ -public class Genotype { - - public final static String PHASED_ALLELE_SEPARATOR = "|"; - public final static String UNPHASED_ALLELE_SEPARATOR = "/"; - - protected InferredGeneticContext commonInfo; - public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; - protected List alleles = null; // new ArrayList(); - protected Type type = null; - - protected boolean isPhased = false; - protected boolean filtersWereAppliedToContext; - - public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { - this(sampleName, alleles, negLog10PError, filters, attributes, isPhased, null); - } - - public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { - if ( alleles != null ) - this.alleles = Collections.unmodifiableList(alleles); - commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); - if ( log10Likelihoods != null ) - commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods)); - filtersWereAppliedToContext = filters != null; - this.isPhased = isPhased; - validate(); - } - - /** - * Creates a new Genotype for sampleName with genotype according to alleles. - * @param sampleName - * @param alleles - * @param negLog10PError the confidence in these alleles - * @param log10Likelihoods a log10 likelihoods for each of the genotype combinations possible for alleles, in the standard VCF ordering, or null if not known - */ - public Genotype(String sampleName, List alleles, double negLog10PError, double[] log10Likelihoods) { - this(sampleName, alleles, negLog10PError, null, null, false, log10Likelihoods); - } - - public Genotype(String sampleName, List alleles, double negLog10PError) { - this(sampleName, alleles, negLog10PError, null, null, false); - } - - public Genotype(String sampleName, List alleles) { - this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null, false); - } - - - // --------------------------------------------------------------------------------------------------------- - // - // Partial-cloning routines (because Genotype is immutable). - // - // --------------------------------------------------------------------------------------------------------- - - public static Genotype modifyName(Genotype g, String name) { - return new Genotype(name, g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); - } - - public static Genotype modifyAttributes(Genotype g, Map attributes) { - return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased()); - } - - public static Genotype modifyAlleles(Genotype g, List alleles) { - return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); - } - - /** - * @return the alleles for this genotype - */ - public List getAlleles() { - return alleles; - } - - public List getAlleles(Allele allele) { - if ( getType() == Type.UNAVAILABLE ) - throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); - - List al = new ArrayList(); - for ( Allele a : alleles ) - if ( a.equals(allele) ) - al.add(a); - - return Collections.unmodifiableList(al); - } - - public Allele getAllele(int i) { - if ( getType() == Type.UNAVAILABLE ) - throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); - return alleles.get(i); - } - - public boolean isPhased() { return isPhased; } - - /** - * @return the ploidy of this genotype - */ - public int getPloidy() { - if ( alleles == null ) - throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype"); - return alleles.size(); - } - - public enum Type { - NO_CALL, - HOM_REF, - HET, - HOM_VAR, - UNAVAILABLE, - MIXED // no-call and call in the same genotype - } - - public Type getType() { - if ( type == null ) { - type = determineType(); - } - return type; - } - - protected Type determineType() { - if ( alleles == null ) - return Type.UNAVAILABLE; - - boolean sawNoCall = false, sawMultipleAlleles = false; - Allele observedAllele = null; - - for ( Allele allele : alleles ) { - if ( allele.isNoCall() ) - sawNoCall = true; - else if ( observedAllele == null ) - observedAllele = allele; - else if ( !allele.equals(observedAllele) ) - sawMultipleAlleles = true; - } - - if ( sawNoCall ) { - if ( observedAllele == null ) - return Type.NO_CALL; - return Type.MIXED; - } - - if ( observedAllele == null ) - throw new ReviewedStingException("BUG: there are no alleles present in this genotype but the alleles list is not null"); - - return sawMultipleAlleles ? Type.HET : observedAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR; - } - - /** - * @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false. - */ - public boolean isHom() { return isHomRef() || isHomVar(); } - - /** - * @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false. - */ - public boolean isHomRef() { return getType() == Type.HOM_REF; } - - /** - * @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false. - */ - public boolean isHomVar() { return getType() == Type.HOM_VAR; } - - /** - * @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false. - */ - public boolean isHet() { return getType() == Type.HET; } - - /** - * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false. - */ - public boolean isNoCall() { return getType() == Type.NO_CALL; } - - /** - * @return true if this genotype is comprised of any alleles that are not no-calls (even if some are). - */ - public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; } - - /** - * @return true if this genotype is comprised of both calls and no-calls. - */ - public boolean isMixed() { return getType() == Type.MIXED; } - - /** - * @return true if the type of this genotype is set. - */ - public boolean isAvailable() { return getType() != Type.UNAVAILABLE; } - - // - // Useful methods for getting genotype likelihoods for a genotype object, if present - // - public boolean hasLikelihoods() { - return (hasAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4)) || - (hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4)); - } - - public GenotypeLikelihoods getLikelihoods() { - GenotypeLikelihoods x = getLikelihoods(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, true); - if ( x != null ) - return x; - else { - x = getLikelihoods(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, false); - if ( x != null ) return x; - else - throw new IllegalStateException("BUG: genotype likelihood field in " + this.getSampleName() + " sample are not either a string or a genotype likelihood class!"); - } - } - - private GenotypeLikelihoods getLikelihoods(String key, boolean asPL) { - Object x = getAttribute(key); - if ( x instanceof String ) { - if ( asPL ) - return GenotypeLikelihoods.fromPLField((String)x); - else - return GenotypeLikelihoods.fromGLField((String)x); - } - else if ( x instanceof GenotypeLikelihoods ) return (GenotypeLikelihoods)x; - else return null; - } - - public void validate() { - if ( alleles == null ) return; - if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0"); - - // int nNoCalls = 0; - for ( Allele allele : alleles ) { - if ( allele == null ) - throw new IllegalArgumentException("BUG: allele cannot be null in Genotype"); - // nNoCalls += allele.isNoCall() ? 1 : 0; - } - - // Technically, the spec does allow for the below case so this is not an illegal state - //if ( nNoCalls > 0 && nNoCalls != alleles.size() ) - // throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); - } - - public String getGenotypeString() { - return getGenotypeString(true); - } - - public String getGenotypeString(boolean ignoreRefState) { - if ( alleles == null ) - return null; - - // Notes: - // 1. Make sure to use the appropriate separator depending on whether the genotype is phased - // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele) - // 3. So that everything is deterministic with regards to integration tests, we sort Alleles (when the genotype isn't phased, of course) - return ParsingUtils.join(isPhased() ? PHASED_ALLELE_SEPARATOR : UNPHASED_ALLELE_SEPARATOR, - ignoreRefState ? getAlleleStrings() : (isPhased() ? getAlleles() : ParsingUtils.sortList(getAlleles()))); - } - - private List getAlleleStrings() { - List al = new ArrayList(); - for ( Allele a : alleles ) - al.add(a.getBaseString()); - - return al; - } - - public String toString() { - int Q = (int)Math.round(getPhredScaledQual()); - return String.format("[%s %s Q%s %s]", getSampleName(), getGenotypeString(false), - Q == -10 ? "." : String.format("%2d",Q), sortedString(getAttributes())); - } - - public String toBriefString() { - return String.format("%s:Q%.2f", getGenotypeString(false), getPhredScaledQual()); - } - - public boolean sameGenotype(Genotype other) { - return sameGenotype(other, true); - } - - public boolean sameGenotype(Genotype other, boolean ignorePhase) { - if (getPloidy() != other.getPloidy()) - return false; // gotta have the same number of allele to be equal - - // By default, compare the elements in the lists of alleles, element-by-element - Collection thisAlleles = this.getAlleles(); - Collection otherAlleles = other.getAlleles(); - - if (ignorePhase) { // do not care about order, only identity of Alleles - thisAlleles = new TreeSet(thisAlleles); //implemented Allele.compareTo() - otherAlleles = new TreeSet(otherAlleles); - } - - return thisAlleles.equals(otherAlleles); - } - - /** - * a utility method for generating sorted strings from a map key set. - * @param c the map - * @param the key type - * @param the value type - * @return a sting, enclosed in {}, with comma seperated key value pairs in order of the keys - */ - private static , V> String sortedString(Map c) { - // NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS - List t = new ArrayList(c.keySet()); - Collections.sort(t); - - List pairs = new ArrayList(); - for (T k : t) { - pairs.add(k + "=" + c.get(k)); - } - - return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; - } - - - // --------------------------------------------------------------------------------------------------------- - // - // get routines to access context info fields - // - // --------------------------------------------------------------------------------------------------------- - public String getSampleName() { return commonInfo.getName(); } - public Set getFilters() { return commonInfo.getFilters(); } - public boolean isFiltered() { return commonInfo.isFiltered(); } - public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } - public boolean filtersWereApplied() { return filtersWereAppliedToContext; } - public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } - public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } - public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } - - public Map getAttributes() { return commonInfo.getAttributes(); } - public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); } - public Object getAttribute(String key) { return commonInfo.getAttribute(key); } - - public Object getAttribute(String key, Object defaultValue) { - return commonInfo.getAttribute(key, defaultValue); - } - - public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/GenotypeLikelihoods.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/GenotypeLikelihoods.java deleted file mode 100755 index 02fb32451..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/GenotypeLikelihoods.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.broad.tribble.TribbleException; -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.EnumMap; -import java.util.Map; - -public class GenotypeLikelihoods { - public static final boolean CAP_PLS = false; - public static final int PL_CAP = 255; - - // - // There are two objects here because we are lazy in creating both representations - // for this object: a vector of log10 Probs and the PL phred-scaled string. Supports - // having one set during initializating, and dynamic creation of the other, if needed - // - private double[] log10Likelihoods = null; - private String likelihoodsAsString_PLs = null; - - public final static GenotypeLikelihoods fromPLField(String PLs) { - return new GenotypeLikelihoods(PLs); - } - - public final static GenotypeLikelihoods fromGLField(String GLs) { - return new GenotypeLikelihoods(parseDeprecatedGLString(GLs)); - } - - public final static GenotypeLikelihoods fromLog10Likelihoods(double[] log10Likelihoods) { - return new GenotypeLikelihoods(log10Likelihoods); - } - - // - // You must use the factory methods now - // - protected GenotypeLikelihoods(String asString) { - likelihoodsAsString_PLs = asString; - } - - protected GenotypeLikelihoods(double[] asVector) { - log10Likelihoods = asVector; - } - - /** - * Returns the genotypes likelihoods in negative log10 vector format. pr{AA} = x, this - * vector returns math.log10(x) for each of the genotypes. Can return null if the - * genotype likelihoods are "missing". - * - * @return - */ - public double[] getAsVector() { - // assumes one of the likelihoods vector or the string isn't null - if ( log10Likelihoods == null ) { - // make sure we create the GL string if it doesn't already exist - log10Likelihoods = parsePLsIntoLikelihoods(likelihoodsAsString_PLs); - } - - return log10Likelihoods; - } - - public String toString() { - return getAsString(); - } - - public String getAsString() { - if ( likelihoodsAsString_PLs == null ) { - // todo -- should we accept null log10Likelihoods and set PLs as MISSING? - if ( log10Likelihoods == null ) - throw new TribbleException("BUG: Attempted to get likelihoods as strings and neither the vector nor the string is set!"); - likelihoodsAsString_PLs = convertLikelihoodsToPLString(log10Likelihoods); - } - - return likelihoodsAsString_PLs; - } - - //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values - //Returns null in case of missing likelihoods - public EnumMap getAsMap(boolean normalizeFromLog10){ - //Make sure that the log10likelihoods are set - double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); - if(likelihoods == null) - return null; - EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); - likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); - likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); - likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]); - return likelihoodsMap; - } - - //Return the neg log10 Genotype Quality (GQ) for the given genotype - //Returns Double.NEGATIVE_INFINITY in case of missing genotype - public double getNegLog10GQ(Genotype.Type genotype){ - - double qual = Double.NEGATIVE_INFINITY; - EnumMap likelihoods = getAsMap(false); - if(likelihoods == null) - return qual; - for(Map.Entry likelihood : likelihoods.entrySet()){ - if(likelihood.getKey() == genotype) - continue; - if(likelihood.getValue() > qual) - qual = likelihood.getValue(); - - } - - //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best) - qual = likelihoods.get(genotype) - qual; - - //Quality of other genotypes 1-P(G) - if (qual < 0) { - double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); - double chosenGenotype = normalized[genotype.ordinal()-1]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); - } - return qual; - } - - private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { - if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { - String[] strings = likelihoodsAsString_PLs.split(","); - double[] likelihoodsAsVector = new double[strings.length]; - for ( int i = 0; i < strings.length; i++ ) { - likelihoodsAsVector[i] = Integer.parseInt(strings[i]) / -10.0; - } - return likelihoodsAsVector; - } else - return null; - } - - /** - * Back-compatibility function to read old style GL formatted genotype likelihoods in VCF format - * @param GLString - * @return - */ - private final static double[] parseDeprecatedGLString(String GLString) { - if ( !GLString.equals(VCFConstants.MISSING_VALUE_v4) ) { - String[] strings = GLString.split(","); - double[] likelihoodsAsVector = new double[strings.length]; - for ( int i = 0; i < strings.length; i++ ) { - likelihoodsAsVector[i] = Double.parseDouble(strings[i]); - } - return likelihoodsAsVector; - } - - return null; - } - - private final static String convertLikelihoodsToPLString(double[] GLs) { - if ( GLs == null ) - return VCFConstants.MISSING_VALUE_v4; - - StringBuilder s = new StringBuilder(); - - double adjust = Double.NEGATIVE_INFINITY; - for ( double l : GLs ) adjust = Math.max(adjust, l); - - boolean first = true; - for ( double l : GLs ) { - if ( ! first ) - s.append(","); - else - first = false; - - long PL = Math.round(-10 * (l - adjust)); - if ( CAP_PLS ) - PL = Math.min(PL, PL_CAP); - s.append(Long.toString(PL)); - } - - return s.toString(); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/IndexingVCFWriter.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/IndexingVCFWriter.java deleted file mode 100644 index 85444c325..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/IndexingVCFWriter.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMSequenceDictionary; -import org.broad.tribble.Tribble; -import org.broad.tribble.TribbleException; -import org.broad.tribble.index.DynamicIndexCreator; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.util.LittleEndianOutputStream; -import org.broad.tribble.util.PositionalStream; -import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.*; - -/** - * this class writes VCF files - */ -abstract class IndexingVCFWriter implements VCFWriter { - final private String name; - private final SAMSequenceDictionary refDict; - - private OutputStream outputStream; - private PositionalStream positionalStream = null; - private DynamicIndexCreator indexer = null; - private LittleEndianOutputStream idxStream = null; - - @Requires({"name != null", - "! ( location == null && output == null )", - "! ( enableOnTheFlyIndexing && location == null )"}) - protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) { - outputStream = output; - this.name = name; - this.refDict = refDict; - - if ( enableOnTheFlyIndexing ) { - try { - idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location))); - //System.out.println("Creating index on the fly for " + location); - indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - indexer.initialize(location, indexer.defaultBinSize()); - positionalStream = new PositionalStream(output); - outputStream = positionalStream; - } catch ( IOException ex ) { - // No matter what we keep going, since we don't care if we can't create the index file - idxStream = null; - indexer = null; - positionalStream = null; - } - } - } - - @Ensures("result != null") - public OutputStream getOutputStream() { - return outputStream; - } - - @Ensures("result != null") - public String getStreamName() { - return name; - } - - public abstract void writeHeader(VCFHeader header); - - /** - * attempt to close the VCF file - */ - public void close() { - // try to close the index stream (keep it separate to help debugging efforts) - if ( indexer != null ) { - try { - Index index = indexer.finalizeIndex(positionalStream.getPosition()); - IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict); - index.write(idxStream); - idxStream.close(); - } catch (IOException e) { - throw new ReviewedStingException("Unable to close index for " + getStreamName(), e); - } - } - } - - /** - * add a record to the file - * - * @param vc the Variant Context object - */ - public void add(VariantContext vc) { - // if we are doing on the fly indexing, add the record ***before*** we write any bytes - if ( indexer != null ) - indexer.addFeature(vc, positionalStream.getPosition()); - } - - /** - * Returns a reasonable "name" for this writer, to display to the user if something goes wrong - * - * @param location - * @param stream - * @return - */ - protected static final String writerName(final File location, final OutputStream stream) { - return location == null ? stream.toString() : location.getAbsolutePath(); - } - - /** - * Returns a output stream writing to location, or throws a UserException if this fails - * @param location - * @return - */ - protected static OutputStream openOutputStream(final File location) { - try { - return new FileOutputStream(location); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/InferredGeneticContext.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/InferredGeneticContext.java deleted file mode 100755 index 43f61343e..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/InferredGeneticContext.java +++ /dev/null @@ -1,243 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - - -import java.util.*; - - -/** - * Common utility routines for VariantContext and Genotype - * - * @author depristo - */ -final class InferredGeneticContext { - public static final double NO_NEG_LOG_10PERROR = -1.0; - - private static Set NO_FILTERS = Collections.unmodifiableSet(new HashSet()); - private static Map NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap()); - - private double negLog10PError = NO_NEG_LOG_10PERROR; - private String name = null; - private Set filters = NO_FILTERS; - private Map attributes = NO_ATTRIBUTES; - -// public InferredGeneticContext(String name) { -// this.name = name; -// } -// -// public InferredGeneticContext(String name, double negLog10PError) { -// this(name); -// setNegLog10PError(negLog10PError); -// } - - public InferredGeneticContext(String name, double negLog10PError, Set filters, Map attributes) { - this.name = name; - setNegLog10PError(negLog10PError); - if ( filters != null ) - setFilters(filters); - if ( attributes != null ) - setAttributes(attributes); - } - - /** - * @return the name - */ - public String getName() { - return name; - } - - /** - * Sets the name - * - * @param name the name associated with this information - */ - public void setName(String name) { - if ( name == null ) throw new IllegalArgumentException("Name cannot be null " + this); - this.name = name; - } - - - // --------------------------------------------------------------------------------------------------------- - // - // Filter - // - // --------------------------------------------------------------------------------------------------------- - - public Set getFilters() { - return Collections.unmodifiableSet(filters); - } - - public boolean isFiltered() { - return filters.size() > 0; - } - - public boolean isNotFiltered() { - return ! isFiltered(); - } - - public void addFilter(String filter) { - if ( filters == NO_FILTERS ) // immutable -> mutable - filters = new HashSet(filters); - - if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this); - if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this); - filters.add(filter); - } - - public void addFilters(Collection filters) { - if ( filters == null ) throw new IllegalArgumentException("BUG: Attempting to add null filters at" + this); - for ( String f : filters ) - addFilter(f); - } - - public void clearFilters() { - filters = new HashSet(); - } - - public void setFilters(Collection filters) { - clearFilters(); - addFilters(filters); - } - - // --------------------------------------------------------------------------------------------------------- - // - // Working with log error rates - // - // --------------------------------------------------------------------------------------------------------- - - public boolean hasNegLog10PError() { - return getNegLog10PError() != NO_NEG_LOG_10PERROR; - } - - /** - * @return the -1 * log10-based error estimate - */ - public double getNegLog10PError() { return negLog10PError; } - public double getPhredScaledQual() { return getNegLog10PError() * 10; } - - public void setNegLog10PError(double negLog10PError) { - if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError); - if ( Double.isInfinite(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be Infinity"); - if ( Double.isNaN(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be NaN"); - - this.negLog10PError = negLog10PError; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Working with attributes - // - // --------------------------------------------------------------------------------------------------------- - public void clearAttributes() { - attributes = new HashMap(); - } - - /** - * @return the attribute map - */ - public Map getAttributes() { - return Collections.unmodifiableMap(attributes); - } - - // todo -- define common attributes as enum - - public void setAttributes(Map map) { - clearAttributes(); - putAttributes(map); - } - - public void putAttribute(String key, Object value) { - putAttribute(key, value, false); - } - - public void putAttribute(String key, Object value, boolean allowOverwrites) { - if ( ! allowOverwrites && hasAttribute(key) ) - throw new IllegalStateException("Attempting to overwrite key->value binding: key = " + key + " this = " + this); - - if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable - attributes = new HashMap(); - - attributes.put(key, value); - } - - public void removeAttribute(String key) { - if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable - attributes = new HashMap(); - attributes.remove(key); - } - - public void putAttributes(Map map) { - if ( map != null ) { - // for efficiency, we can skip the validation if the map is empty - if ( attributes.size() == 0 ) { - if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable - attributes = new HashMap(); - attributes.putAll(map); - } else { - for ( Map.Entry elt : map.entrySet() ) { - putAttribute(elt.getKey(), elt.getValue(), false); - } - } - } - } - - public boolean hasAttribute(String key) { - return attributes.containsKey(key); - } - - public int getNumAttributes() { - return attributes.size(); - } - - /** - * @param key the attribute key - * - * @return the attribute value for the given key (or null if not set) - */ - public Object getAttribute(String key) { - return attributes.get(key); - } - - public Object getAttribute(String key, Object defaultValue) { - if ( hasAttribute(key) ) - return attributes.get(key); - else - return defaultValue; - } - - public String getAttributeAsString(String key, String defaultValue) { - Object x = getAttribute(key); - if ( x == null ) return defaultValue; - if ( x instanceof String ) return (String)x; - return String.valueOf(x); // throws an exception if this isn't a string - } - - public int getAttributeAsInt(String key, int defaultValue) { - Object x = getAttribute(key); - if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue; - if ( x instanceof Integer ) return (Integer)x; - return Integer.valueOf((String)x); // throws an exception if this isn't a string - } - - public double getAttributeAsDouble(String key, double defaultValue) { - Object x = getAttribute(key); - if ( x == null ) return defaultValue; - if ( x instanceof Double ) return (Double)x; - return Double.valueOf((String)x); // throws an exception if this isn't a string - } - - public boolean getAttributeAsBoolean(String key, boolean defaultValue) { - Object x = getAttribute(key); - if ( x == null ) return defaultValue; - if ( x instanceof Boolean ) return (Boolean)x; - return Boolean.valueOf((String)x); // throws an exception if this isn't a string - } - -// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" -// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } -// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } -// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } -// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } -// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } -// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } -// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableGenotype.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableGenotype.java deleted file mode 100755 index f5072e040..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableGenotype.java +++ /dev/null @@ -1,68 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import java.util.*; - -/** - * This class emcompasses all the basic information about a genotype. It is immutable. - * - * @author Mark DePristo - */ -class MutableGenotype extends Genotype { - public MutableGenotype(Genotype parent) { - super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased()); - } - - public MutableGenotype(String sampleName, Genotype parent) { - super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased()); - } - - - public MutableGenotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean genotypesArePhased) { - super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased); - } - - public MutableGenotype(String sampleName, List alleles, double negLog10PError) { - super(sampleName, alleles, negLog10PError); - } - - public MutableGenotype(String sampleName, List alleles) { - super(sampleName, alleles); - } - - public Genotype unmodifiableGenotype() { - return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes(), isPhased()); - } - - - /** - * - * @param alleles list of alleles - */ - public void setAlleles(List alleles) { - this.alleles = new ArrayList(alleles); - validate(); - } - - public void setPhase(boolean isPhased) { - super.isPhased = isPhased; - } - - // --------------------------------------------------------------------------------------------------------- - // - // InferredGeneticContext mutation operators - // - // --------------------------------------------------------------------------------------------------------- - public void setName(String name) { commonInfo.setName(name); } - public void addFilter(String filter) { commonInfo.addFilter(filter); } - public void addFilters(Collection filters) { commonInfo.addFilters(filters); } - public void clearFilters() { commonInfo.clearFilters(); } - public void setFilters(Collection filters) { commonInfo.setFilters(filters); } - public void setAttributes(Map map) { commonInfo.setAttributes(map); } - public void clearAttributes() { commonInfo.clearAttributes(); } - public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); } - public void removeAttribute(String key) { commonInfo.removeAttribute(key); } - public void putAttributes(Map map) { commonInfo.putAttributes(map); } - public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); } - public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); } - -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableVariantContext.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableVariantContext.java deleted file mode 100755 index 24e71ae50..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableVariantContext.java +++ /dev/null @@ -1,213 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - - -import java.util.Collection; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; - -/** - * Mutable version of VariantContext - * - * @author depristo - */ -class MutableVariantContext extends VariantContext { - // --------------------------------------------------------------------------------------------------------- - // - // constructors - // - // --------------------------------------------------------------------------------------------------------- - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles) { - super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - public MutableVariantContext(VariantContext parent) { - super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel()); - } - - /** - * Sets the alleles segregating in this context to the collect of alleles. Each of which must be unique according - * to equals() in Allele. Validate() should be called when you are done modifying the context. - * - * @param alleles - */ - public void setAlleles(Collection alleles) { - this.alleles.clear(); - for ( Allele a : alleles ) - addAllele(a); - } - - /** - * Adds allele to the segregating allele list in this context to the collection of alleles. The new - * allele must be be unique according to equals() in Allele. - * Validate() should be called when you are done modifying the context. - * - * @param allele - */ - public void addAllele(Allele allele) { - final boolean allowDuplicates = false; // used to be a parameter - - type = null; - - for ( Allele a : alleles ) { - if ( a.basesMatch(allele) && ! allowDuplicates ) - throw new IllegalArgumentException("Duplicate allele added to VariantContext" + this); - } - - // we are a novel allele - alleles.add(allele); - } - - public void clearGenotypes() { - genotypes = new TreeMap(); - } - - /** - * Adds this single genotype to the context, not allowing duplicate genotypes to be added - * @param genotype - */ - public void addGenotypes(Genotype genotype) { - putGenotype(genotype.getSampleName(), genotype, false); - } - - /** - * Adds these genotypes to the context, not allowing duplicate genotypes to be added - * @param genotypes - */ - public void addGenotypes(Collection genotypes) { - for ( Genotype g : genotypes ) { - addGenotype(g); - } - } - - /** - * Adds these genotype to the context, not allowing duplicate genotypes to be added. - * @param genotypes - */ - public void addGenotypes(Map genotypes) { - - for ( Map.Entry elt : genotypes.entrySet() ) { - addGenotype(elt.getValue()); - } - } - - /** - * Adds these genotypes to the context. - * - * @param genotypes - */ - public void putGenotypes(Map genotypes) { - for ( Map.Entry g : genotypes.entrySet() ) - putGenotype(g.getKey(), g.getValue()); - } - - /** - * Adds these genotypes to the context. - * - * @param genotypes - */ - public void putGenotypes(Collection genotypes) { - for ( Genotype g : genotypes ) - putGenotype(g); - } - - /** - * Adds this genotype to the context, throwing an error if it's already bound. - * - * @param genotype - */ - public void addGenotype(Genotype genotype) { - addGenotype(genotype.getSampleName(), genotype); - } - - /** - * Adds this genotype to the context, throwing an error if it's already bound. - * - * @param genotype - */ - public void addGenotype(String sampleName, Genotype genotype) { - putGenotype(sampleName, genotype, false); - } - - /** - * Adds this genotype to the context. - * - * @param genotype - */ - public void putGenotype(Genotype genotype) { - putGenotype(genotype.getSampleName(), genotype); - } - - /** - * Adds this genotype to the context. - * - * @param genotype - */ - public void putGenotype(String sampleName, Genotype genotype) { - putGenotype(sampleName, genotype, true); - } - - private void putGenotype(String sampleName, Genotype genotype, boolean allowOverwrites) { - if ( hasGenotype(sampleName) && ! allowOverwrites ) - throw new IllegalStateException("Attempting to overwrite sample->genotype binding: " + sampleName + " this=" + this); - - if ( ! sampleName.equals(genotype.getSampleName()) ) - throw new IllegalStateException("Sample name doesn't equal genotype.getSample(): " + sampleName + " genotype=" + genotype); - - this.genotypes.put(sampleName, genotype); - } - - /** - * Removes the binding from sampleName to genotype. If this doesn't exist, throws an IllegalArgumentException - * @param sampleName - */ - public void removeGenotype(String sampleName) { - if ( ! this.genotypes.containsKey(sampleName) ) - throw new IllegalArgumentException("Sample name isn't contained in genotypes " + sampleName + " genotypes =" + genotypes); - - this.genotypes.remove(sampleName); - } - - /** - * Removes genotype from the context. If this doesn't exist, throws an IllegalArgumentException - * @param genotype - */ - public void removeGenotype(Genotype genotype) { - removeGenotype(genotype.getSampleName()); - } - - // todo -- add replace genotype routine - - // --------------------------------------------------------------------------------------------------------- - // - // InferredGeneticContext mutation operators - // - // --------------------------------------------------------------------------------------------------------- - - public void setSource(String source) { commonInfo.setName(source); } - public void addFilter(String filter) { commonInfo.addFilter(filter); } - public void addFilters(Collection filters) { commonInfo.addFilters(filters); } - public void clearFilters() { commonInfo.clearFilters(); } - public void setFilters(Collection filters) { commonInfo.setFilters(filters); } - public void setAttributes(Map map) { commonInfo.setAttributes(map); } - public void clearAttributes() { commonInfo.clearAttributes(); } - public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); } - public void removeAttribute(String key) { commonInfo.removeAttribute(key); } - public void putAttributes(Map map) { commonInfo.putAttributes(map); } - public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); } - public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); } - public void setID(String id) { putAttribute(ID_KEY, id, true); } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCF3Codec.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCF3Codec.java deleted file mode 100755 index 9f653872a..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCF3Codec.java +++ /dev/null @@ -1,198 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.ParsingUtils; - -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.*; - - -/** - * A feature codec for the VCF3 specification, to read older VCF files. VCF3 has been - * depreciated in favor of VCF4 (See VCF codec for the latest information) - * - *

- * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example) - *

- * - *

- * See also: @see VCF specification
- * See also: @see VCF spec. publication - *

- * - * @author Mark DePristo - * @since 2010 - */ -class VCF3Codec extends AbstractVCFCodec { - public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; - - - /** - * @param reader the line reader to take header lines from - * @return the number of header lines - */ - public Object readHeader(LineReader reader) { - List headerStrings = new ArrayList(); - - String line; - try { - boolean foundHeaderVersion = false; - while ((line = reader.readLine()) != null) { - lineNo++; - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]); - } - headerStrings.add(line); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - return createHeader(headerStrings, line); - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - } catch (IOException e) { - throw new RuntimeException("IO Exception ", e); - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * @param filterString the string to parse - * @return a set of the filters applied - */ - protected Set parseFilters(String filterString) { - - // null for unfiltered - if ( filterString.equals(VCFConstants.UNFILTERED) ) - return null; - - // empty set for passes filters - LinkedHashSet fFields = new LinkedHashSet(); - - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - return fFields; - - if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status"); - - // do we have the filter string cached? - if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); - - // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) - fFields.add(filterString); - else - fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - - filterHash.put(filterString, fFields); - - return fFields; - } - - /** - * create a genotype map - * @param str the string - * @param alleles the list of alleles - * @param chr chrom - * @param pos position - * @return a mapping of sample name to genotype object - */ - public Map createGenotypeMap(String str, List alleles, String chr, int pos) { - if (genotypeParts == null) - genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; - - int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); - - Map genotypes = new LinkedHashMap(nParts); - - // get the format keys - int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - - // cycle through the sample names - Iterator sampleNameIterator = header.getGenotypeSamples().iterator(); - - // clear out our allele mapping - alleleMap.clear(); - - // cycle through the genotype strings - for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { - int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - - double GTQual = VariantContext.NO_NEG_LOG_10PERROR; - Set genotypeFilters = null; - Map gtAttributes = null; - String sampleName = sampleNameIterator.next(); - - // check to see if the value list is longer than the key list, which is a problem - if (nGTKeys < GTValueSplitSize) - generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]); - - int genotypeAlleleLocation = -1; - if (nGTKeys >= 1) { - gtAttributes = new HashMap(nGTKeys - 1); - - for (int i = 0; i < nGTKeys; i++) { - final String gtKey = new String(genotypeKeyArray[i]); - boolean missing = i >= GTValueSplitSize; - - if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { - genotypeAlleleLocation = i; - } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { - GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); - } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { - genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i])); - } else if ( missing || GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) { - gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4); - } else { - gtAttributes.put(gtKey, new String(GTValueArray[i])); - } - } - } - - // check to make sure we found a genotype field - if ( genotypeAlleleLocation < 0 ) - generateException("Unable to find the GT field for the record; the GT field is required"); - if ( genotypeAlleleLocation > 0 ) - generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes"); - - boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; - - // add it to the list - try { - genotypes.put(sampleName, new Genotype(sampleName, - parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap), - GTQual, - genotypeFilters, - gtAttributes, - phased)); - } catch (TribbleException e) { - throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); - } - } - - return genotypes; - } - - @Override - public boolean canDecode(final File potentialInput) { - return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFAltHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFAltHeaderLine.java deleted file mode 100644 index e432fe411..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFAltHeaderLine.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -/** - * @author ebanks - * A class representing a key=value entry for ALT fields in the VCF header - */ -class VCFAltHeaderLine extends VCFSimpleHeaderLine { - - /** - * create a VCF filter header line - * - * @param name the name for this header line - * @param description the description for this header line - */ - public VCFAltHeaderLine(String name, String description) { - super(name, description, SupportedHeaderLineType.ALT); - } - - /** - * create a VCF info header line - * - * @param line the header line - * @param version the vcf header version - */ - protected VCFAltHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.ALT); - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCodec.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCodec.java deleted file mode 100755 index f873aebcc..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCodec.java +++ /dev/null @@ -1,228 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.ParsingUtils; - -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.*; - -/** - * A feature codec for the VCF 4 specification - * - *

- * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a - * header line, and then data lines each containing information about a position in the genome. - *

- *

One of the main uses of next-generation sequencing is to discover variation amongst large populations - * of related samples. Recently the format for storing next-generation read alignments has been - * standardised by the SAM/BAM file format specification. This has significantly improved the - * interoperability of next-generation tools for alignment, visualisation, and variant calling. - * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent - * types of sequence variation, including SNPs, indels and larger structural variants, together - * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for - * fast data retrieval of variants from a range of positions on the reference genome. - * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects - * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements - * various utilities for processing VCF files, including validation, merging and comparing, - * and also provides a general Perl and Python API. - * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.

- * - *

- * See also: @see VCF specification
- * See also: @see VCF spec. publication - *

- * - *

File format example

- *
- *     ##fileformat=VCFv4.0
- *     #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
- *     chr1    109     .       A       T       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:610,327:308:-316.30,-95.47,-803.03:99
- *     chr1    147     .       C       A       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:294,49:118:-57.87,-34.96,-338.46:99
- * 
- * - * @author Mark DePristo - * @since 2010 - */ -public class VCFCodec extends AbstractVCFCodec { - // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. - - public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; - - /** - * @param reader the line reader to take header lines from - * @return the number of header lines - */ - public Object readHeader(LineReader reader) { - List headerStrings = new ArrayList(); - - String line; - try { - boolean foundHeaderVersion = false; - while ((line = reader.readLine()) != null) { - lineNo++; - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( version == VCFHeaderVersion.VCF3_3 || version == VCFHeaderVersion.VCF3_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); - if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); - } - headerStrings.add(line); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - return createHeader(headerStrings, line); - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - } catch (IOException e) { - throw new RuntimeException("IO Exception ", e); - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * - * @param filterString the string to parse - * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) - */ - protected Set parseFilters(String filterString) { - return parseFilters(filterHash, lineNo, filterString); - } - - public static Set parseFilters(final Map> cache, final int lineNo, final String filterString) { - // null for unfiltered - if ( filterString.equals(VCFConstants.UNFILTERED) ) - return null; - - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return Collections.emptySet(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); - if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); - - // do we have the filter string cached? - if ( cache != null && cache.containsKey(filterString) ) - return Collections.unmodifiableSet(cache.get(filterString)); - - // empty set for passes filters - LinkedHashSet fFields = new LinkedHashSet(); - // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) - fFields.add(filterString); - else - fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - - fFields = fFields; - if ( cache != null ) cache.put(filterString, fFields); - - return Collections.unmodifiableSet(fFields); - } - - - /** - * create a genotype map - * @param str the string - * @param alleles the list of alleles - * @return a mapping of sample name to genotype object - */ - public Map createGenotypeMap(String str, List alleles, String chr, int pos) { - if (genotypeParts == null) - genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; - - int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); - - Map genotypes = new LinkedHashMap(nParts); - - // get the format keys - int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - - // cycle through the sample names - Iterator sampleNameIterator = header.getGenotypeSamples().iterator(); - - // clear out our allele mapping - alleleMap.clear(); - - // cycle through the genotype strings - for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { - int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - - double GTQual = VariantContext.NO_NEG_LOG_10PERROR; - Set genotypeFilters = null; - Map gtAttributes = null; - String sampleName = sampleNameIterator.next(); - - // check to see if the value list is longer than the key list, which is a problem - if (nGTKeys < GTValueSplitSize) - generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]); - - int genotypeAlleleLocation = -1; - if (nGTKeys >= 1) { - gtAttributes = new HashMap(nGTKeys - 1); - - for (int i = 0; i < nGTKeys; i++) { - final String gtKey = new String(genotypeKeyArray[i]); - boolean missing = i >= GTValueSplitSize; - - // todo -- all of these on the fly parsing of the missing value should be static constants - if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { - genotypeAlleleLocation = i; - } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { - GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); - } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { - genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i])); - } else if ( missing ) { - gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4); - } else { - gtAttributes.put(gtKey, new String(GTValueArray[i])); - } - } - } - - // check to make sure we found a genotype field if we are a VCF4.0 file - if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 ) - generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0"); - if ( genotypeAlleleLocation > 0 ) - generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); - - List GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); - boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; - - // add it to the list - try { - genotypes.put(sampleName, - new Genotype(sampleName, - GTalleles, - GTQual, - genotypeFilters, - gtAttributes, - phased)); - } catch (TribbleException e) { - throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); - } - } - - return genotypes; - } - - @Override - public boolean canDecode(final File potentialInput) { - return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCompoundHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCompoundHeaderLine.java deleted file mode 100755 index 74d6cf62f..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCompoundHeaderLine.java +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.Map; - -/** - * a base class for compound header lines, which include info lines and format lines (so far) - */ -abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { - public enum SupportedHeaderLineType { - INFO(true), FORMAT(false); - - public final boolean allowFlagValues; - SupportedHeaderLineType(boolean flagValues) { - allowFlagValues = flagValues; - } - } - - // the field types - private String name; - private int count = -1; - private VCFHeaderLineCount countType; - private String description; - private VCFHeaderLineType type; - - // access methods - public String getName() { return name; } - public String getDescription() { return description; } - public VCFHeaderLineType getType() { return type; } - public VCFHeaderLineCount getCountType() { return countType; } - public int getCount() { - if ( countType != VCFHeaderLineCount.INTEGER ) - throw new ReviewedStingException("Asking for header line count when type is not an integer"); - return count; - } - - // utility method - public int getCount(int numAltAlleles) { - int myCount; - switch ( countType ) { - case INTEGER: myCount = count; break; - case UNBOUNDED: myCount = -1; break; - case A: myCount = numAltAlleles; break; - case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break; - default: throw new ReviewedStingException("Unknown count type: " + countType); - } - return myCount; - } - - public void setNumberToUnbounded() { - countType = VCFHeaderLineCount.UNBOUNDED; - count = -1; - } - - // our type of line, i.e. format, info, etc - private final SupportedHeaderLineType lineType; - - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.name = name; - this.countType = VCFHeaderLineCount.INTEGER; - this.count = count; - this.type = type; - this.description = description; - this.lineType = lineType; - validate(); - } - - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.name = name; - this.countType = count; - this.type = type; - this.description = description; - this.lineType = lineType; - validate(); - } - - /** - * create a VCF format header line - * - * @param line the header line - * @param version the VCF header version - * @param lineType the header line type - * - */ - protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description")); - name = mapping.get("ID"); - count = -1; - final String numberStr = mapping.get("Number"); - if ( numberStr.equals(VCFConstants.PER_ALLELE_COUNT) ) { - countType = VCFHeaderLineCount.A; - } else if ( numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT) ) { - countType = VCFHeaderLineCount.G; - } else if ( ((version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) && - numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || - ((version == VCFHeaderVersion.VCF3_2 || version == VCFHeaderVersion.VCF3_3) && - numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3)) ) { - countType = VCFHeaderLineCount.UNBOUNDED; - } else { - countType = VCFHeaderLineCount.INTEGER; - count = Integer.valueOf(numberStr); - - } - type = VCFHeaderLineType.valueOf(mapping.get("Type")); - if (type == VCFHeaderLineType.Flag && !allowFlagValues()) - throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); - - description = mapping.get("Description"); - if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; - - this.lineType = lineType; - - validate(); - } - - private void validate() { - if ( name == null || type == null || description == null || lineType == null ) - throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", - super.getKey(), name, type, description, lineType )); - } - - /** - * make a string representation of this header line - * @return a string representation - */ - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - Object number; - switch ( countType ) { - case A: number = VCFConstants.PER_ALLELE_COUNT; break; - case G: number = VCFConstants.PER_GENOTYPE_COUNT; break; - case UNBOUNDED: number = VCFConstants.UNBOUNDED_ENCODING_v4; break; - case INTEGER: - default: number = count; - } - map.put("Number", number); - map.put("Type", type); - map.put("Description", description); - return lineType.toString() + "=" + toStringEncoding(map); - } - - /** - * returns true if we're equal to another compounder header line - * @param o a compound header line - * @return true if equal - */ - public boolean equals(Object o) { - if ( !(o instanceof VCFCompoundHeaderLine) ) - return false; - VCFCompoundHeaderLine other = (VCFCompoundHeaderLine)o; - return equalsExcludingDescription(other) && - description.equals(other.description); - } - - public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { - return count == other.count && - countType == other.countType && - type == other.type && - lineType == other.lineType && - name.equals(other.name); - } - - public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) { - return lineType == other.lineType && - name.equals(other.name); - } - - /** - * do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true) - * @return true if we do, false otherwise - */ - abstract boolean allowFlagValues(); - -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFConstants.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFConstants.java deleted file mode 100755 index 91f6b1ba9..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFConstants.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -import java.util.Locale; - -final class VCFConstants { - public static final Locale VCF_LOCALE = Locale.US; - - // reserved INFO/FORMAT field keys - public static final String ANCESTRAL_ALLELE_KEY = "AA"; - public static final String ALLELE_COUNT_KEY = "AC"; - public static final String ALLELE_FREQUENCY_KEY = "AF"; - public static final String ALLELE_NUMBER_KEY = "AN"; - public static final String RMS_BASE_QUALITY_KEY = "BQ"; - public static final String CIGAR_KEY = "CIGAR"; - public static final String DBSNP_KEY = "DB"; - public static final String DEPTH_KEY = "DP"; - public static final String DOWNSAMPLED_KEY = "DS"; - public static final String EXPECTED_ALLELE_COUNT_KEY = "EC"; - public static final String END_KEY = "END"; - public static final String GENOTYPE_FILTER_KEY = "FT"; - public static final String GENOTYPE_KEY = "GT"; - @Deprecated - public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods - public static final String GENOTYPE_POSTERIORS_KEY = "GP"; - public static final String GENOTYPE_QUALITY_KEY = "GQ"; - public static final String HAPMAP2_KEY = "H2"; - public static final String HAPMAP3_KEY = "H3"; - public static final String HAPLOTYPE_QUALITY_KEY = "HQ"; - public static final String RMS_MAPPING_QUALITY_KEY = "MQ"; - public static final String MAPPING_QUALITY_ZERO_KEY = "MQ0"; - public static final String SAMPLE_NUMBER_KEY = "NS"; - public static final String PHRED_GENOTYPE_LIKELIHOODS_KEY = "PL"; // phred-scaled genotype likelihoods - public static final String PHASE_QUALITY_KEY = "PQ"; - public static final String PHASE_SET_KEY = "PS"; - public static final String OLD_DEPTH_KEY = "RD"; - public static final String STRAND_BIAS_KEY = "SB"; - public static final String SOMATIC_KEY = "SOMATIC"; - public static final String VALIDATED_KEY = "VALIDATED"; - public static final String THOUSAND_GENOMES_KEY = "1000G"; - - // separators - public static final String FORMAT_FIELD_SEPARATOR = ":"; - public static final String GENOTYPE_FIELD_SEPARATOR = ":"; - public static final char GENOTYPE_FIELD_SEPARATOR_CHAR = ':'; - public static final String FIELD_SEPARATOR = "\t"; - public static final char FIELD_SEPARATOR_CHAR = '\t'; - public static final String FILTER_CODE_SEPARATOR = ";"; - public static final String INFO_FIELD_ARRAY_SEPARATOR = ","; - public static final char INFO_FIELD_ARRAY_SEPARATOR_CHAR = ','; - public static final String ID_FIELD_SEPARATOR = ";"; - public static final String INFO_FIELD_SEPARATOR = ";"; - public static final char INFO_FIELD_SEPARATOR_CHAR = ';'; - public static final String UNPHASED = "/"; - public static final String PHASED = "|"; - public static final String PHASED_SWITCH_PROB_v3 = "\\"; - public static final String PHASING_TOKENS = "/|\\"; - - // old indel alleles - public static final char DELETION_ALLELE_v3 = 'D'; - public static final char INSERTION_ALLELE_v3 = 'I'; - - // missing/default values - public static final String UNFILTERED = "."; - public static final String PASSES_FILTERS_v3 = "0"; - public static final String PASSES_FILTERS_v4 = "PASS"; - public static final String EMPTY_ID_FIELD = "."; - public static final String EMPTY_INFO_FIELD = "."; - public static final String EMPTY_ALTERNATE_ALLELE_FIELD = "."; - public static final String MISSING_VALUE_v4 = "."; - public static final String MISSING_QUALITY_v3 = "-1"; - public static final Double MISSING_QUALITY_v3_DOUBLE = Double.valueOf(MISSING_QUALITY_v3); - - public static final String MISSING_GENOTYPE_QUALITY_v3 = "-1"; - public static final String MISSING_HAPLOTYPE_QUALITY_v3 = "-1"; - public static final String MISSING_DEPTH_v3 = "-1"; - public static final String UNBOUNDED_ENCODING_v4 = "."; - public static final String UNBOUNDED_ENCODING_v3 = "-1"; - public static final String PER_ALLELE_COUNT = "A"; - public static final String PER_GENOTYPE_COUNT = "G"; - public static final String EMPTY_ALLELE = "."; - public static final String EMPTY_GENOTYPE = "./."; - public static final double MAX_GENOTYPE_QUAL = 99.0; - - public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f"; - public static final String DOUBLE_PRECISION_INT_SUFFIX = ".00"; - public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFilterHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFilterHeaderLine.java deleted file mode 100755 index 5e16fbed0..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFilterHeaderLine.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -/** - * @author ebanks - * A class representing a key=value entry for FILTER fields in the VCF header - */ -class VCFFilterHeaderLine extends VCFSimpleHeaderLine { - - /** - * create a VCF filter header line - * - * @param name the name for this header line - * @param description the description for this header line - */ - public VCFFilterHeaderLine(String name, String description) { - super(name, description, SupportedHeaderLineType.FILTER); - } - - /** - * create a VCF info header line - * - * @param line the header line - * @param version the vcf header version - */ - protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.FILTER); - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFormatHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFormatHeaderLine.java deleted file mode 100755 index f73c032cc..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFormatHeaderLine.java +++ /dev/null @@ -1,32 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - - -/** - * @author ebanks - *

- * Class VCFFormatHeaderLine - *

- * A class representing a key=value entry for genotype FORMAT fields in the VCF header - */ -class VCFFormatHeaderLine extends VCFCompoundHeaderLine { - - public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); - if (type == VCFHeaderLineType.Flag) - throw new IllegalArgumentException("Flag is an unsupported type for format fields"); - } - - public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); - } - - protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.FORMAT); - } - - // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype). - @Override - boolean allowFlagValues() { - return false; - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeader.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeader.java deleted file mode 100755 index be1b49ec1..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeader.java +++ /dev/null @@ -1,198 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - - -import org.broad.tribble.util.ParsingUtils; - -import java.util.*; - - -/** - * @author aaron - *

- * Class VCFHeader - *

- * A class representing the VCF header - */ -class VCFHeader { - - // the mandatory header fields - public enum HEADER_FIELDS { - CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO - } - - // the associated meta data - private final Set mMetaData; - private final Map mInfoMetaData = new HashMap(); - private final Map mFormatMetaData = new HashMap(); - private final Map mOtherMetaData = new HashMap(); - - // the list of auxillary tags - private final Set mGenotypeSampleNames = new LinkedHashSet(); - - // the character string that indicates meta data - public static final String METADATA_INDICATOR = "##"; - - // the header string indicator - public static final String HEADER_INDICATOR = "#"; - - // were the input samples sorted originally (or are we sorting them)? - private boolean samplesWereAlreadySorted = true; - - - /** - * create a VCF header, given a list of meta data and auxillary tags - * - * @param metaData the meta data associated with this header - */ - public VCFHeader(Set metaData) { - mMetaData = new TreeSet(metaData); - loadVCFVersion(); - loadMetaDataMaps(); - } - - /** - * create a VCF header, given a list of meta data and auxillary tags - * - * @param metaData the meta data associated with this header - * @param genotypeSampleNames the sample names - */ - public VCFHeader(Set metaData, Set genotypeSampleNames) { - mMetaData = new TreeSet(); - if ( metaData != null ) - mMetaData.addAll(metaData); - - mGenotypeSampleNames.addAll(genotypeSampleNames); - - loadVCFVersion(); - loadMetaDataMaps(); - - samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames); - } - - /** - * Adds a header line to the header metadata. - * - * @param headerLine Line to add to the existing metadata component. - */ - public void addMetaDataLine(VCFHeaderLine headerLine) { - mMetaData.add(headerLine); - } - - /** - * check our metadata for a VCF version tag, and throw an exception if the version is out of date - * or the version is not present - */ - public void loadVCFVersion() { - List toRemove = new ArrayList(); - for ( VCFHeaderLine line : mMetaData ) - if ( VCFHeaderVersion.isFormatString(line.getKey())) { - toRemove.add(line); - } - // remove old header lines for now, - mMetaData.removeAll(toRemove); - - } - - /** - * load the format/info meta data maps (these are used for quick lookup by key name) - */ - private void loadMetaDataMaps() { - for ( VCFHeaderLine line : mMetaData ) { - if ( line instanceof VCFInfoHeaderLine ) { - VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; - mInfoMetaData.put(infoLine.getName(), infoLine); - } - else if ( line instanceof VCFFormatHeaderLine ) { - VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; - mFormatMetaData.put(formatLine.getName(), formatLine); - } - else { - mOtherMetaData.put(line.getKey(), line); - } - } - } - - /** - * get the header fields in order they're presented in the input file (which is now required to be - * the order presented in the spec). - * - * @return a set of the header fields, in order - */ - public Set getHeaderFields() { - Set fields = new LinkedHashSet(); - for (HEADER_FIELDS field : HEADER_FIELDS.values()) - fields.add(field); - return fields; - } - - /** - * get the meta data, associated with this header - * - * @return a set of the meta data - */ - public Set getMetaData() { - Set lines = new LinkedHashSet(); - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); - lines.addAll(mMetaData); - return Collections.unmodifiableSet(lines); - } - - /** - * get the genotyping sample names - * - * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false - */ - public Set getGenotypeSamples() { - return mGenotypeSampleNames; - } - - /** - * do we have genotyping data? - * - * @return true if we have genotyping columns, false otherwise - */ - public boolean hasGenotypingData() { - return mGenotypeSampleNames.size() > 0; - } - - /** - * were the input samples sorted originally? - * - * @return true if the input samples were sorted originally, false otherwise - */ - public boolean samplesWereAlreadySorted() { - return samplesWereAlreadySorted; - } - - /** @return the column count */ - public int getColumnCount() { - return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0); - } - - /** - * @param key the header key name - * @return the meta data line, or null if there is none - */ - public VCFInfoHeaderLine getInfoHeaderLine(String key) { - return mInfoMetaData.get(key); - } - - /** - * @param key the header key name - * @return the meta data line, or null if there is none - */ - public VCFFormatHeaderLine getFormatHeaderLine(String key) { - return mFormatMetaData.get(key); - } - - /** - * @param key the header key name - * @return the meta data line, or null if there is none - */ - public VCFHeaderLine getOtherHeaderLine(String key) { - return mOtherMetaData.get(key); - } -} - - - diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLine.java deleted file mode 100755 index 61b0722bd..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLine.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.broad.tribble.TribbleException; - -import java.util.Map; - - -/** - * @author ebanks - *

- * Class VCFHeaderLine - *

- * A class representing a key=value entry in the VCF header - */ -class VCFHeaderLine implements Comparable { - protected static boolean ALLOW_UNBOUND_DESCRIPTIONS = true; - protected static String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; - - private String mKey = null; - private String mValue = null; - - - /** - * create a VCF header line - * - * @param key the key for this header line - * @param value the value for this header line - */ - public VCFHeaderLine(String key, String value) { - if ( key == null ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot be null: key = " + key); - mKey = key; - mValue = value; - } - - /** - * Get the key - * - * @return the key - */ - public String getKey() { - return mKey; - } - - /** - * Get the value - * - * @return the value - */ - public String getValue() { - return mValue; - } - - public String toString() { - return toStringEncoding(); - } - - /** - * Should be overloaded in sub classes to do subclass specific - * - * @return the string encoding - */ - protected String toStringEncoding() { - return mKey + "=" + mValue; - } - - public boolean equals(Object o) { - if ( !(o instanceof VCFHeaderLine) ) - return false; - return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue()); - } - - public int compareTo(Object other) { - return toString().compareTo(other.toString()); - } - - /** - * @param line the line - * @return true if the line is a VCF meta data line, or false if it is not - */ - public static boolean isHeaderLine(String line) { - return line != null && line.length() > 0 && VCFHeader.HEADER_INDICATOR.equals(line.substring(0,1)); - } - - /** - * create a string of a mapping pair for the target VCF version - * @param keyValues a mapping of the key->value pairs to output - * @return a string, correctly formatted - */ - public static String toStringEncoding(Map keyValues) { - StringBuilder builder = new StringBuilder(); - builder.append("<"); - boolean start = true; - for (Map.Entry entry : keyValues.entrySet()) { - if (start) start = false; - else builder.append(","); - - if ( entry.getValue() == null ) throw new TribbleException.InternalCodecException("Header problem: unbound value at " + entry + " from " + keyValues); - - builder.append(entry.getKey()); - builder.append("="); - builder.append(entry.getValue().toString().contains(",") || - entry.getValue().toString().contains(" ") || - entry.getKey().equals("Description") ? "\""+ entry.getValue() + "\"" : entry.getValue()); - } - builder.append(">"); - return builder.toString(); - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineCount.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineCount.java deleted file mode 100644 index 8fd29d188..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineCount.java +++ /dev/null @@ -1,8 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -/** - * the count encodings we use for fields in VCF header lines - */ -public enum VCFHeaderLineCount { - INTEGER, A, G, UNBOUNDED; -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineTranslator.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineTranslator.java deleted file mode 100755 index 538b4ff8e..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineTranslator.java +++ /dev/null @@ -1,124 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import java.util.*; - -/** - * A class for translating between vcf header versions - */ -public class VCFHeaderLineTranslator { - private static Map mapping; - - static { - mapping = new HashMap(); - mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser()); - mapping.put(VCFHeaderVersion.VCF4_1,new VCF4Parser()); - mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser()); - mapping.put(VCFHeaderVersion.VCF3_2,new VCF3Parser()); - } - - public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder) { - return mapping.get(version).parseLine(valueLine,expectedTagOrder); - } -} - - -interface VCFLineParser { - public Map parseLine(String valueLine, List expectedTagOrder); -} - - -/** - * a class that handles the to and from disk for VCF 4 lines - */ -class VCF4Parser implements VCFLineParser { - Set bracketed = new HashSet(); - - /** - * parse a VCF4 line - * @param valueLine the line - * @return a mapping of the tags parsed out - */ - public Map parseLine(String valueLine, List expectedTagOrder) { - // our return map - Map ret = new LinkedHashMap(); - - // a builder to store up characters as we go - StringBuilder builder = new StringBuilder(); - - // store the key when we're parsing out the values - String key = ""; - - // where are we in the stream of characters? - int index = 0; - - // are we inside a quotation? we don't special case ',' then - boolean inQuote = false; - - // a little switch machine to parse out the tags. Regex ended up being really complicated and ugly [yes, but this machine is getting ugly now... MAD] - for (char c: valueLine.toCharArray()) { - if ( c == '\"' ) { - inQuote = ! inQuote; - } else if ( inQuote ) { - builder.append(c); - } else { - switch (c) { - case ('<') : if (index == 0) break; // if we see a open bracket at the beginning, ignore it - case ('>') : if (index == valueLine.length()-1) ret.put(key,builder.toString().trim()); break; // if we see a close bracket, and we're at the end, add an entry to our list - case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break; // at an equals, copy the key and reset the builder - case (',') : ret.put(key,builder.toString().trim()); builder = new StringBuilder(); break; // drop the current key value to the return map - default: builder.append(c); // otherwise simply append to the current string - } - } - - index++; - } - - // validate the tags against the expected list - index = 0; - if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size()); - for (String str : ret.keySet()) { - if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); - index++; - } - return ret; - } -} - -class VCF3Parser implements VCFLineParser { - - public Map parseLine(String valueLine, List expectedTagOrder) { - // our return map - Map ret = new LinkedHashMap(); - - // a builder to store up characters as we go - StringBuilder builder = new StringBuilder(); - - // where are we in the stream of characters? - int index = 0; - // where in the expected tag order are we? - int tagIndex = 0; - - // are we inside a quotation? we don't special case ',' then - boolean inQuote = false; - - // a little switch machine to parse out the tags. Regex ended up being really complicated and ugly - for (char c: valueLine.toCharArray()) { - switch (c) { - case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it - case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map - default: builder.append(c); // otherwise simply append to the current string - } - index++; - } - ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); - - // validate the tags against the expected list - index = 0; - if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); - for (String str : ret.keySet()){ - if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); - index++; - } - return ret; - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineType.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineType.java deleted file mode 100755 index 65cf1a327..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineType.java +++ /dev/null @@ -1,8 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -/** - * the type encodings we use for fields in VCF header lines - */ -enum VCFHeaderLineType { - Integer, Float, String, Character, Flag; -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderVersion.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderVersion.java deleted file mode 100755 index 21e737abe..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderVersion.java +++ /dev/null @@ -1,91 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.broad.tribble.TribbleException; - -/** - * information that identifies each header version - */ -enum VCFHeaderVersion { - VCF3_2("VCRv3.2","format"), - VCF3_3("VCFv3.3","fileformat"), - VCF4_0("VCFv4.0","fileformat"), - VCF4_1("VCFv4.1","fileformat"); - - private final String versionString; - private final String formatString; - - /** - * create the enum, privately, using: - * @param vString the version string - * @param fString the format string - */ - VCFHeaderVersion(String vString, String fString) { - this.versionString = vString; - this.formatString = fString; - } - - /** - * get the header version - * @param version the version string - * @return a VCFHeaderVersion object - */ - public static VCFHeaderVersion toHeaderVersion(String version) { - version = clean(version); - for (VCFHeaderVersion hv : VCFHeaderVersion.values()) - if (hv.versionString.equals(version)) - return hv; - return null; - } - - /** - * are we a valid version string of some type - * @param version the version string - * @return true if we're valid of some type, false otherwise - */ - public static boolean isVersionString(String version){ - return toHeaderVersion(version) != null; - } - - /** - * are we a valid format string for some type - * @param format the format string - * @return true if we're valid of some type, false otherwise - */ - public static boolean isFormatString(String format){ - format = clean(format); - for (VCFHeaderVersion hv : VCFHeaderVersion.values()) - if (hv.formatString.equals(format)) - return true; - return false; - } - - public static VCFHeaderVersion getHeaderVersion(String versionLine) { - String[] lineFields = versionLine.split("="); - if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) ) - throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line"); - - if ( !isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - - return toHeaderVersion(lineFields[1]); - } - - /** - * Utility function to clean up a VCF header string - * - * @param s string - * @return trimmed version of s - */ - private static String clean(String s) { - return s.trim(); - } - - - public String getVersionString() { - return versionString; - } - - public String getFormatString() { - return formatString; - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFInfoHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFInfoHeaderLine.java deleted file mode 100755 index 642a78b76..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFInfoHeaderLine.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - - -/** - * @author ebanks - *

- * Class VCFInfoHeaderLine - *

- * A class representing a key=value entry for INFO fields in the VCF header - */ -class VCFInfoHeaderLine extends VCFCompoundHeaderLine { - public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); - } - - public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); - } - - protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.INFO); - } - - // info fields allow flag values - @Override - boolean allowFlagValues() { - return true; - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFNamedHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFNamedHeaderLine.java deleted file mode 100755 index b3ce5d841..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFNamedHeaderLine.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -/** an interface for named header lines **/ -interface VCFNamedHeaderLine { - String getName(); -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFParser.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFParser.java deleted file mode 100755 index 95a3f7bf1..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFParser.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import java.util.List; -import java.util.Map; - - -/** - * All VCF codecs need to implement this interface so that we can perform lazy loading. - */ -interface VCFParser { - - /** - * create a genotype map - * @param str the string - * @param alleles the list of alleles - * @param chr chrom - * @param pos position - * @return a mapping of sample name to genotype object - */ - public Map createGenotypeMap(String str, List alleles, String chr, int pos); - -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFSimpleHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFSimpleHeaderLine.java deleted file mode 100644 index 17706c705..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFSimpleHeaderLine.java +++ /dev/null @@ -1,81 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.Map; - - -/** - * @author ebanks - * A class representing a key=value entry for simple VCF header types - */ -abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { - - public enum SupportedHeaderLineType { - FILTER, ALT; - } - - private String name; - private String description; - - // our type of line, i.e. filter, alt, etc - private final SupportedHeaderLineType lineType; - - - /** - * create a VCF filter header line - * - * @param name the name for this header line - * @param description the description for this header line - * @param lineType the header line type - */ - public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.lineType = lineType; - this.name = name; - this.description = description; - - if ( name == null || description == null ) - throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description )); - } - - /** - * create a VCF info header line - * - * @param line the header line - * @param version the vcf header version - * @param lineType the header line type - */ - protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.lineType = lineType; - Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); - name = mapping.get("ID"); - description = mapping.get("Description"); - if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; - } - - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - map.put("Description", description); - return lineType.toString() + "=" + toStringEncoding(map); - } - - public boolean equals(Object o) { - if ( !(o instanceof VCFSimpleHeaderLine) ) - return false; - VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o; - return name.equals(other.name) && - description.equals(other.description); - } - - public String getName() { - return name; - } - - public String getDescription() { - return description; - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFUtils.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFUtils.java deleted file mode 100755 index dc78d40ac..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFUtils.java +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; - -import java.util.*; - -/** - * A set of static utility methods for common operations on VCF files/records. - */ -class VCFUtils { - /** - * Constructor access disallowed...static utility methods only! - */ - private VCFUtils() { } - - public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { - // Collect the eval rod names - final Set names = new TreeSet(); - for ( final RodBinding evalRod : rodBindings ) - names.add(evalRod.getName()); - return getVCFHeadersFromRods(toolkit, names); - } - - public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) { - return getVCFHeadersFromRods(toolkit, (Collection)null); - } - - public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { - Map data = new HashMap(); - - // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - // ignore the rod if it's not in our list - if ( rodNames != null && !rodNames.contains(source.getName()) ) - continue; - - if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) - data.put(source.getName(), (VCFHeader)source.getHeader()); - } - - return data; - } - - public static Map getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) { - Map data = new HashMap(); - - // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - // ignore the rod if lacks the prefix - if ( ! source.getName().startsWith(prefix) ) - continue; - - if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) - data.put(source.getName(), (VCFHeader)source.getHeader()); - } - - return data; - } - - /** - * Gets the header fields from all VCF rods input by the user - * - * @param toolkit GATK engine - * - * @return a set of all fields - */ - public static Set getHeaderFields(GenomeAnalysisEngine toolkit) { - return getHeaderFields(toolkit, null); - } - - /** - * Gets the header fields from all VCF rods input by the user - * - * @param toolkit GATK engine - * @param rodNames names of rods to use, or null if we should use all possible ones - * - * @return a set of all fields - */ - public static Set getHeaderFields(GenomeAnalysisEngine toolkit, Collection rodNames) { - - // keep a map of sample name to occurrences encountered - TreeSet fields = new TreeSet(); - - // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - // ignore the rod if it's not in our list - if ( rodNames != null && !rodNames.contains(source.getName()) ) - continue; - - if ( source.getRecordType().equals(VariantContext.class)) { - VCFHeader header = (VCFHeader)source.getHeader(); - if ( header != null ) - fields.addAll(header.getMetaData()); - } - } - - return fields; - } - - /** Only displays a warning if a logger is provided and an identical warning hasn't been already issued */ - private static final class HeaderConflictWarner { - Logger logger; - Set alreadyIssued = new HashSet(); - - private HeaderConflictWarner(final Logger logger) { - this.logger = logger; - } - - public void warn(final VCFHeaderLine line, final String msg) { - if ( logger != null && ! alreadyIssued.contains(line.getKey()) ) { - alreadyIssued.add(line.getKey()); - logger.warn(msg); - } - } - } - - public static Set smartMergeHeaders(Collection headers, Logger logger) throws IllegalStateException { - HashMap map = new HashMap(); // from KEY.NAME -> line - HeaderConflictWarner conflictWarner = new HeaderConflictWarner(logger); - - // todo -- needs to remove all version headers from sources and add its own VCF version line - for ( VCFHeader source : headers ) { - //System.out.printf("Merging in header %s%n", source); - for ( VCFHeaderLine line : source.getMetaData()) { - String key = line.getKey(); - - if ( line instanceof VCFNamedHeaderLine) - key = key + "" + ((VCFNamedHeaderLine) line).getName(); - - if ( map.containsKey(key) ) { - VCFHeaderLine other = map.get(key); - if ( line.equals(other) ) - continue; - else if ( ! line.getClass().equals(other.getClass()) ) - throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - else if ( line instanceof VCFFilterHeaderLine) { - String lineName = ((VCFFilterHeaderLine) line).getName(); String otherName = ((VCFFilterHeaderLine) other).getName(); - if ( ! lineName.equals(otherName) ) - throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - } else if ( line instanceof VCFCompoundHeaderLine ) { - VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line; - VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other; - - // if the names are the same, but the values are different, we need to quit - if (! (compLine).equalsExcludingDescription(compOther) ) { - if ( compLine.getType().equals(compOther.getType()) ) { - // The Number entry is an Integer that describes the number of values that can be - // included with the INFO field. For example, if the INFO field contains a single - // number, then this value should be 1. However, if the INFO field describes a pair - // of numbers, then this value should be 2 and so on. If the number of possible - // values varies, is unknown, or is unbounded, then this value should be '.'. - conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other); - compOther.setNumberToUnbounded(); - } else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - map.put(key, compOther); - } else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - } else { - throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); - } - } - if ( ! compLine.getDescription().equals(compOther) ) - conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); - } else { - // we are not equal, but we're not anything special either - conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other); - } - } else { - map.put(key, line); - //System.out.printf("Adding header line %s%n", line); - } - } - } - - return new HashSet(map.values()); - } - - public static String rsIDOfFirstRealVariant(List VCs, VariantContext.Type type) { - if ( VCs == null ) - return null; - - String rsID = null; - for ( VariantContext vc : VCs ) { - if ( vc.getType() == type ) { - rsID = vc.getID(); - break; - } - } - - return rsID; - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFWriter.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFWriter.java deleted file mode 100755 index 15bdb5046..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFWriter.java +++ /dev/null @@ -1,16 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -/** - * this class writes VCF files - */ -public interface VCFWriter { - - public void writeHeader(VCFHeader header); - - /** - * attempt to close the VCF file - */ - public void close(); - - public void add(VariantContext vc); -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContext.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContext.java deleted file mode 100755 index 3a193a00a..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContext.java +++ /dev/null @@ -1,1615 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.broad.tribble.Feature; -import org.broad.tribble.TribbleException; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Class VariantContext - * - * == High-level overview == - * - * The VariantContext object is a single general class system for representing genetic variation data composed of: - * - * * Allele: representing single genetic haplotypes (A, T, ATC, -) - * * Genotype: an assignment of alleles for each chromosome of a single named sample at a particular locus - * * VariantContext: an abstract class holding all segregating alleles at a locus as well as genotypes - * for multiple individuals containing alleles at that locus - * - * The class system works by defining segregating alleles, creating a variant context representing the segregating - * information at a locus, and potentially creating and associating genotypes with individuals in the context. - * - * All of the classes are highly validating -- call validate() if you modify them -- so you can rely on the - * self-consistency of the data once you have a VariantContext in hand. The system has a rich set of assessor - * and manipulator routines, as well as more complex static support routines in VariantContextUtils. - * - * The VariantContext (and Genotype) objects are attributed (supporting addition of arbitrary key/value pairs) and - * filtered (can represent a variation that is viewed as suspect). - * - * VariantContexts are dynamically typed, so whether a VariantContext is a SNP, Indel, or NoVariant depends - * on the properties of the alleles in the context. See the detailed documentation on the Type parameter below. - * - * It's also easy to create subcontexts based on selected genotypes. - * - * == Working with Variant Contexts == - * By default, VariantContexts are immutable. In order to access (in the rare circumstances where you need them) - * setter routines, you need to create MutableVariantContexts and MutableGenotypes. - * - * === Some example data === - * - * Allele A, Aref, T, Tref; - * Allele del, delRef, ATC, ATCref; - * - * A [ref] / T at 10 - * GenomeLoc snpLoc = GenomeLocParser.createGenomeLoc("chr1", 10, 10); - * - * - / ATC [ref] from 20-23 - * GenomeLoc delLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 22); - * - * // - [ref] / ATC immediately after 20 - * GenomeLoc insLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 20); - * - * === Alleles === - * - * See the documentation in the Allele class itself - * - * What are they? - * - * Alleles can be either reference or non-reference - * - * Example alleles used here: - * - * del = new Allele("-"); - * A = new Allele("A"); - * Aref = new Allele("A", true); - * T = new Allele("T"); - * ATC = new Allele("ATC"); - * - * === Creating variant contexts === - * - * ==== By hand ==== - * - * Here's an example of a A/T polymorphism with the A being reference: - * - *

- * VariantContext vc = new VariantContext(name, snpLoc, Arrays.asList(Aref, T));
- * 
- * - * If you want to create a non-variant site, just put in a single reference allele - * - *
- * VariantContext vc = new VariantContext(name, snpLoc, Arrays.asList(Aref));
- * 
- * - * A deletion is just as easy: - * - *
- * VariantContext vc = new VariantContext(name, delLoc, Arrays.asList(ATCref, del));
- * 
- * - * The only 2 things that distinguishes between a insertion and deletion are the reference allele - * and the location of the variation. An insertion has a Null reference allele and at least - * one non-reference Non-Null allele. Additionally, the location of the insertion is immediately after - * a 1-bp GenomeLoc (at say 20). - * - *
- * VariantContext vc = new VariantContext("name", insLoc, Arrays.asList(delRef, ATC));
- * 
- * - * ==== Converting rods and other data structures to VCs ==== - * - * You can convert many common types into VariantContexts using the general function: - * - *
- * VariantContextAdaptors.convertToVariantContext(name, myObject)
- * 
- * - * dbSNP and VCFs, for example, can be passed in as myObject and a VariantContext corresponding to that - * object will be returned. A null return type indicates that the type isn't yet supported. This is the best - * and easiest way to create contexts using RODs. - * - * - * === Working with genotypes === - * - *
- * List alleles = Arrays.asList(Aref, T);
- * Genotype g1 = new Genotype(Arrays.asList(Aref, Aref), "g1", 10);
- * Genotype g2 = new Genotype(Arrays.asList(Aref, T), "g2", 10);
- * Genotype g3 = new Genotype(Arrays.asList(T, T), "g3", 10);
- * VariantContext vc = new VariantContext(snpLoc, alleles, Arrays.asList(g1, g2, g3));
- * 
- * - * At this point we have 3 genotypes in our context, g1-g3. - * - * You can assess a good deal of information about the genotypes through the VariantContext: - * - *
- * vc.hasGenotypes()
- * vc.isMonomorphic()
- * vc.isPolymorphic()
- * vc.getSamples().size()
- *
- * vc.getGenotypes()
- * vc.getGenotypes().get("g1")
- * vc.hasGenotype("g1")
- *
- * vc.getChromosomeCount()
- * vc.getChromosomeCount(Aref)
- * vc.getChromosomeCount(T)
- * 
- * - * === NO_CALL alleles === - * - * The system allows one to create Genotypes carrying special NO_CALL alleles that aren't present in the - * set of context alleles and that represent undetermined alleles in a genotype: - * - * Genotype g4 = new Genotype(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), "NO_DATA_FOR_SAMPLE", 10); - * - * - * === subcontexts === - * It's also very easy get subcontext based only the data in a subset of the genotypes: - * - *
- * VariantContext vc12 = vc.subContextFromGenotypes(Arrays.asList(g1,g2));
- * VariantContext vc1 = vc.subContextFromGenotypes(Arrays.asList(g1));
- * 
- * - * @author depristo - */ -public class VariantContext implements Feature { // to enable tribble intergration - protected InferredGeneticContext commonInfo = null; - public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; - public final static String UNPARSED_GENOTYPE_MAP_KEY = "_UNPARSED_GENOTYPE_MAP_"; - public final static String UNPARSED_GENOTYPE_PARSER_KEY = "_UNPARSED_GENOTYPE_PARSER_"; - public final static String ID_KEY = "ID"; - - private final Byte REFERENCE_BASE_FOR_INDEL; - - public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); - - /** The location of this VariantContext */ - protected String contig; - protected long start; - protected long stop; - - /** The type (cached for performance reasons) of this context */ - protected Type type = null; - - /** A set of the alleles segregating in this context */ - final protected List alleles; - - /** A mapping from sampleName -> genotype objects for all genotypes associated with this context */ - protected Map genotypes = null; - - /** Counts for each of the possible Genotype types in this context */ - protected int[] genotypeCounts = null; - - public final static Map NO_GENOTYPES = Collections.unmodifiableMap(new HashMap()); - - // a fast cached access point to the ref / alt alleles for biallelic case - private Allele REF = null; - - // set to the alt allele when biallelic, otherwise == null - private Allele ALT = null; - - // were filters applied? - private boolean filtersWereAppliedToContext; - - // --------------------------------------------------------------------------------------------------------- - // - // constructors - // - // --------------------------------------------------------------------------------------------------------- - - - /** - * the complete constructor. Makes a complete VariantContext from its arguments - * This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES. - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes map - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @param referenceBaseForIndel padded reference base - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false, true); - } - - /** - * the complete constructor. Makes a complete VariantContext from its arguments - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes map - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false, true); - } - - /** - * Makes a VariantContext from its arguments without parsing the genotypes. - * Note that this constructor assumes that if there is genotype data, then it's been put into - * the attributes with the UNPARSED_GENOTYPE_MAP_KEY and that the codec has been added with the - * UNPARSED_GENOTYPE_PARSER_KEY. It doesn't validate that this is the case because it's possible - * that there is no genotype data. - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @param referenceBaseForIndel padded reference base - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true, true); - } - - /** - * Create a new VariantContext - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes set - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false, true); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false, true); - } - - /** - * Create a new variant context with genotypes but without Perror, filters, and attributes - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - this(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - /** - * Copy constructor - * - * @param other the VariantContext to copy - */ - public VariantContext(VariantContext other) { - this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false, true); - } - - /** - * the actual constructor. Private access only - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes map - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @param referenceBaseForIndel padded reference base - * @param genotypesAreUnparsed true if the genotypes have not yet been parsed - * @param performValidation if true, call validate() as the final step in construction - */ - private VariantContext(String source, String contig, long start, long stop, - Collection alleles, Map genotypes, - double negLog10PError, Set filters, Map attributes, - Byte referenceBaseForIndel, boolean genotypesAreUnparsed, - boolean performValidation ) { - if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } - this.contig = contig; - this.start = start; - this.stop = stop; - - if ( !genotypesAreUnparsed && attributes != null ) { - if ( attributes.containsKey(UNPARSED_GENOTYPE_MAP_KEY) ) - attributes.remove(UNPARSED_GENOTYPE_MAP_KEY); - if ( attributes.containsKey(UNPARSED_GENOTYPE_PARSER_KEY) ) - attributes.remove(UNPARSED_GENOTYPE_PARSER_KEY); - } - - this.commonInfo = new InferredGeneticContext(source, negLog10PError, filters, attributes); - filtersWereAppliedToContext = filters != null; - REFERENCE_BASE_FOR_INDEL = referenceBaseForIndel; - - if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } - - // we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles - this.alleles = makeAlleles(alleles); - - - if ( genotypes == null ) { genotypes = NO_GENOTYPES; } - this.genotypes = Collections.unmodifiableMap(genotypes); - - // cache the REF and ALT alleles - int nAlleles = alleles.size(); - for ( Allele a : alleles ) { - if ( a.isReference() ) { - REF = a; - } else if ( nAlleles == 2 ) { // only cache ALT when biallelic - ALT = a; - } - } - - if ( performValidation ) { - validate(); - } - } - - // --------------------------------------------------------------------------------------------------------- - // - // Partial-cloning routines (because Variant Context is immutable). - // - // IMPORTANT: These routines assume that the VariantContext on which they're called is already valid. - // Due to this assumption, they explicitly tell the constructor NOT to perform validation by - // calling validate(), and instead perform validation only on the data that's changed. - // - // Note that we don't call vc.getGenotypes() because that triggers the lazy loading. - // Also note that we need to create a new attributes map because it's unmodifiable and the constructor may try to modify it. - // - // --------------------------------------------------------------------------------------------------------- - - public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) { - VariantContext modifiedVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false, false); - modifiedVC.validateGenotypes(); - return modifiedVC; - } - - public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) { - VariantContext modifiedVC = new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true, false); - - // Since start and end have changed, we need to call both validateAlleles() and validateReferencePadding(), - // since those validation routines rely on the values of start and end: - modifiedVC.validateAlleles(); - modifiedVC.validateReferencePadding(); - - return modifiedVC; - } - - public static VariantContext modifyFilters(VariantContext vc, Set filters) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true, false); - } - - public static VariantContext modifyAttributes(VariantContext vc, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true, false); - } - - public static VariantContext modifyReferencePadding(VariantContext vc, Byte b) { - VariantContext modifiedVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), b, true, false); - modifiedVC.validateReferencePadding(); - return modifiedVC; - } - - public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true, false); - } - - // --------------------------------------------------------------------------------------------------------- - // - // Selectors - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotype - * genotype and alleles in genotype. This is the right way to test if a single genotype is actually - * variant or not. - * - * @param genotype genotype - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Genotype genotype) { - return subContextFromGenotypes(Arrays.asList(genotype)); - } - - - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotypes - * genotypes and alleles in these genotypes. This is the right way to test if a single genotype is actually - * variant or not. - * - * @param genotypes genotypes - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Collection genotypes) { - return subContextFromGenotypes(genotypes, allelesOfGenotypes(genotypes)) ; - } - - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotypes - * genotypes. Also, the resulting variant context will contain the alleles provided, not only those found in genotypes - * - * @param genotypes genotypes - * @param alleles the set of allele segregating alleles at this site. Must include those in genotypes, but may be more - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Collection genotypes, Collection alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes(), getReferenceBaseForIndel()); - } - - - /** - * helper routine for subcontext - * @param genotypes genotypes - * @return allele set - */ - private Set allelesOfGenotypes(Collection genotypes) { - Set alleles = new HashSet(); - - boolean addedref = false; - for ( Genotype g : genotypes ) { - for ( Allele a : g.getAlleles() ) { - addedref = addedref || a.isReference(); - if ( a.isCalled() ) - alleles.add(a); - } - } - if ( ! addedref ) alleles.add(getReference()); - - return alleles; - } - - // --------------------------------------------------------------------------------------------------------- - // - // type operations - // - // --------------------------------------------------------------------------------------------------------- - - /** - * see: http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=handbook&part=ch5&rendertype=table&id=ch5.ch5_t3 - * - * Format: - * dbSNP variation class - * Rules for assigning allele classes - * Sample allele definition - * - * Single Nucleotide Polymorphisms (SNPs)a - * Strictly defined as single base substitutions involving A, T, C, or G. - * A/T - * - * Deletion/Insertion Polymorphisms (DIPs) - * Designated using the full sequence of the insertion as one allele, and either a fully - * defined string for the variant allele or a '-' character to specify the deleted allele. - * This class will be assigned to a variation if the variation alleles are of different lengths or - * if one of the alleles is deleted ('-'). - * T/-/CCTA/G - * - * No-variation - * Reports may be submitted for segments of sequence that are assayed and determined to be invariant - * in the sample. - * (NoVariation) - * - * Mixed - * Mix of other classes - * - * Also supports NO_VARIATION type, used to indicate that the site isn't polymorphic in the population - * - * - * Not currently supported: - * - * Heterozygous sequencea - * The term heterozygous is used to specify a region detected by certain methods that do not - * resolve the polymorphism into a specific sequence motif. In these cases, a unique flanking - * sequence must be provided to define a sequence context for the variation. - * (heterozygous) - * - * Microsatellite or short tandem repeat (STR) - * Alleles are designated by providing the repeat motif and the copy number for each allele. - * Expansion of the allele repeat motif designated in dbSNP into full-length sequence will - * be only an approximation of the true genomic sequence because many microsatellite markers are - * not fully sequenced and are resolved as size variants only. - * (CAC)8/9/10/11 - * - * Named variant - * Applies to insertion/deletion polymorphisms of longer sequence features, such as retroposon - * dimorphism for Alu or line elements. These variations frequently include a deletion '-' indicator - * for the absent allele. - * (alu) / - - * - * Multi-Nucleotide Polymorphism (MNP) - * Assigned to variations that are multi-base variations of a single, common length - * GGA/AGT - */ - public enum Type { - NO_VARIATION, - SNP, - MNP, // a multi-nucleotide polymorphism - INDEL, - SYMBOLIC, - MIXED, - } - - /** - * Determines (if necessary) and returns the type of this variation by examining the alleles it contains. - * - * @return the type of this VariantContext - **/ - public Type getType() { - if ( type == null ) - determineType(); - - return type; - } - - /** - * convenience method for SNPs - * - * @return true if this is a SNP, false otherwise - */ - public boolean isSNP() { return getType() == Type.SNP; } - - - /** - * convenience method for variants - * - * @return true if this is a variant allele, false if it's reference - */ - public boolean isVariant() { return getType() != Type.NO_VARIATION; } - - /** - * convenience method for point events - * - * @return true if this is a SNP or ref site, false if it's an indel or mixed event - */ - public boolean isPointEvent() { return isSNP() || !isVariant(); } - - /** - * convenience method for indels - * - * @return true if this is an indel, false otherwise - */ - public boolean isIndel() { return getType() == Type.INDEL; } - - /** - * @return true if the alleles indicate a simple insertion (i.e., the reference allele is Null) - */ - public boolean isSimpleInsertion() { - // can't just call !isSimpleDeletion() because of complex indels - return getType() == Type.INDEL && getReference().isNull() && isBiallelic(); - } - - /** - * @return true if the alleles indicate a simple deletion (i.e., a single alt allele that is Null) - */ - public boolean isSimpleDeletion() { - // can't just call !isSimpleInsertion() because of complex indels - return getType() == Type.INDEL && getAlternateAllele(0).isNull() && isBiallelic(); - } - - /** - * @return true if the alleles indicate neither a simple deletion nor a simple insertion - */ - public boolean isComplexIndel() { - return isIndel() && !isSimpleDeletion() && !isSimpleInsertion(); - } - - public boolean isSymbolic() { - return getType() == Type.SYMBOLIC; - } - - public boolean isMNP() { - return getType() == Type.MNP; - } - - /** - * convenience method for indels - * - * @return true if this is an mixed variation, false otherwise - */ - public boolean isMixed() { return getType() == Type.MIXED; } - - - // --------------------------------------------------------------------------------------------------------- - // - // Generic accessors - // - // --------------------------------------------------------------------------------------------------------- - - public boolean hasID() { - return commonInfo.hasAttribute(ID_KEY); - } - - public String getID() { - return (String)commonInfo.getAttribute(ID_KEY); - } - - public boolean hasReferenceBaseForIndel() { - return REFERENCE_BASE_FOR_INDEL != null; - } - - // the indel base that gets stripped off for indels - public Byte getReferenceBaseForIndel() { - return REFERENCE_BASE_FOR_INDEL; - } - - // --------------------------------------------------------------------------------------------------------- - // - // get routines to access context info fields - // - // --------------------------------------------------------------------------------------------------------- - public String getSource() { return commonInfo.getName(); } - public Set getFilters() { return commonInfo.getFilters(); } - public boolean isFiltered() { return commonInfo.isFiltered(); } - public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } - public boolean filtersWereApplied() { return filtersWereAppliedToContext; } - public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } - public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } - public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } - - public Map getAttributes() { return commonInfo.getAttributes(); } - public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); } - public Object getAttribute(String key) { return commonInfo.getAttribute(key); } - - public Object getAttribute(String key, Object defaultValue) { - return commonInfo.getAttribute(key, defaultValue); - } - - public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - - // --------------------------------------------------------------------------------------------------------- - // - // Working with alleles - // - // --------------------------------------------------------------------------------------------------------- - - /** - * @return the reference allele for this context - */ - public Allele getReference() { - Allele ref = REF; - if ( ref == null ) - throw new IllegalStateException("BUG: no reference allele found at " + this); - return ref; - } - - - /** - * @return true if the context is strictly bi-allelic - */ - public boolean isBiallelic() { - return getNAlleles() == 2; - } - - /** - * @return The number of segregating alleles in this context - */ - public int getNAlleles() { - return alleles.size(); - } - - /** - * @return The allele sharing the same bases as this String. A convenience method; better to use byte[] - */ - public Allele getAllele(String allele) { - return getAllele(allele.getBytes()); - } - - /** - * @return The allele sharing the same bases as this byte[], or null if no such allele is present. - */ - public Allele getAllele(byte[] allele) { - return Allele.getMatchingAllele(getAlleles(), allele); - } - - /** - * @return True if this context contains Allele allele, or false otherwise - */ - public boolean hasAllele(Allele allele) { - return hasAllele(allele, false); - } - - public boolean hasAllele(Allele allele, boolean ignoreRefState) { - if ( allele == REF || allele == ALT ) // optimization for cached cases - return true; - - for ( Allele a : getAlleles() ) { - if ( a.equals(allele, ignoreRefState) ) - return true; - } - - return false; - } - - - /** - * Gets the alleles. This method should return all of the alleles present at the location, - * including the reference allele. There are no constraints imposed on the ordering of alleles - * in the set. If the reference is not an allele in this context it will not be included. - * - * @return the set of alleles - */ - public List getAlleles() { return alleles; } - - /** - * Gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference allele. There are no constraints imposed on the ordering of alleles - * in the set. - * - * @return the set of alternate alleles - */ - public List getAlternateAlleles() { - return alleles.subList(1, alleles.size()); - } - - /** - * Gets the sizes of the alternate alleles if they are insertion/deletion events, and returns a list of their sizes - * - * @return a list of indel lengths ( null if not of type indel or mixed ) - */ - public List getIndelLengths() { - if ( getType() != Type.INDEL && getType() != Type.MIXED ) { - return null; - } - - List lengths = new ArrayList(); - for ( Allele a : getAlternateAlleles() ) { - lengths.add(a.length() - getReference().length()); - } - - return lengths; - } - - /** - * @param i -- the ith allele (from 0 to n - 2 for a context with n alleles including a reference allele) - * @return the ith non-reference allele in this context - * @throws IllegalArgumentException if i is invalid - */ - public Allele getAlternateAllele(int i) { - return alleles.get(i+1); - } - - /** - * @param other VariantContext whose alternate alleles to compare against - * @return true if this VariantContext has the same alternate alleles as other, - * regardless of ordering. Otherwise returns false. - */ - public boolean hasSameAlternateAllelesAs ( VariantContext other ) { - List thisAlternateAlleles = getAlternateAlleles(); - List otherAlternateAlleles = other.getAlternateAlleles(); - - if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) { - return false; - } - - for ( Allele allele : thisAlternateAlleles ) { - if ( ! otherAlternateAlleles.contains(allele) ) { - return false; - } - } - - return true; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Working with genotypes - // - // --------------------------------------------------------------------------------------------------------- - - private void loadGenotypes() { - if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) { - if ( genotypes == null ) - genotypes = NO_GENOTYPES; - return; - } - - Object parserObj = getAttribute(UNPARSED_GENOTYPE_PARSER_KEY); - if ( parserObj == null || !(parserObj instanceof VCFParser) ) - throw new IllegalStateException("There is no VCF parser stored to unparse the genotype data"); - VCFParser parser = (VCFParser)parserObj; - - Object mapObj = getAttribute(UNPARSED_GENOTYPE_MAP_KEY); - if ( mapObj == null ) - throw new IllegalStateException("There is no mapping string stored to unparse the genotype data"); - - genotypes = parser.createGenotypeMap(mapObj.toString(), new ArrayList(alleles), getChr(), getStart()); - - commonInfo.removeAttribute(UNPARSED_GENOTYPE_MAP_KEY); - commonInfo.removeAttribute(UNPARSED_GENOTYPE_PARSER_KEY); - - validateGenotypes(); - } - - /** - * @return the number of samples in the context - */ - public int getNSamples() { - loadGenotypes(); - return genotypes.size(); - } - - /** - * @return true if the context has associated genotypes - */ - public boolean hasGenotypes() { - loadGenotypes(); - return genotypes.size() > 0; - } - - public boolean hasGenotypes(Collection sampleNames) { - loadGenotypes(); - for ( String name : sampleNames ) { - if ( ! genotypes.containsKey(name) ) - return false; - } - return true; - } - - /** - * @return set of all Genotypes associated with this context - */ - public Map getGenotypes() { - loadGenotypes(); - return genotypes; - } - - public List getGenotypesSortedByName() { - loadGenotypes(); - Collection types = new TreeMap(genotypes).values(); - return new ArrayList(types); - } - - /** - * Returns a map from sampleName -> Genotype for the genotype associated with sampleName. Returns a map - * for consistency with the multi-get function. - * - * @param sampleName - * @return - * @throws IllegalArgumentException if sampleName isn't bound to a genotype - */ - public Map getGenotypes(String sampleName) { - return getGenotypes(Arrays.asList(sampleName)); - } - - /** - * Returns a map from sampleName -> Genotype for each sampleName in sampleNames. Returns a map - * for consistency with the multi-get function. - * - * @param sampleNames a unique list of sample names - * @return - * @throws IllegalArgumentException if sampleName isn't bound to a genotype - */ - public Map getGenotypes(Collection sampleNames) { - HashMap map = new HashMap(); - - for ( String name : sampleNames ) { - if ( map.containsKey(name) ) throw new IllegalArgumentException("Duplicate names detected in requested samples " + sampleNames); - final Genotype g = getGenotype(name); - if ( g != null ) { - map.put(name, g); - } - } - - return map; - } - - /** - * @return the set of all sample names in this context - */ - public Set getSampleNames() { - return getGenotypes().keySet(); - } - - /** - * @param sample the sample name - * - * @return the Genotype associated with the given sample in this context or null if the sample is not in this context - */ - public Genotype getGenotype(String sample) { - return getGenotypes().get(sample); - } - - public boolean hasGenotype(String sample) { - return getGenotypes().containsKey(sample); - } - - public Genotype getGenotype(int ith) { - return getGenotypesSortedByName().get(ith); - } - - - /** - * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS - * - * @return chromosome count - */ - public int getChromosomeCount() { - int n = 0; - - for ( Genotype g : getGenotypes().values() ) { - n += g.isNoCall() ? 0 : g.getPloidy(); - } - - return n; - } - - /** - * Returns the number of chromosomes carrying allele A in the genotypes - * - * @param a allele - * @return chromosome count - */ - public int getChromosomeCount(Allele a) { - int n = 0; - - for ( Genotype g : getGenotypes().values() ) { - n += g.getAlleles(a).size(); - } - - return n; - } - - /** - * Genotype-specific functions -- are the genotypes monomorphic w.r.t. to the alleles segregating at this - * site? That is, is the number of alternate alleles among all fo the genotype == 0? - * - * @return true if it's monomorphic - */ - public boolean isMonomorphic() { - return ! isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples()); - } - - /** - * Genotype-specific functions -- are the genotypes polymorphic w.r.t. to the alleles segregating at this - * site? That is, is the number of alternate alleles among all fo the genotype > 0? - * - * @return true if it's polymorphic - */ - public boolean isPolymorphic() { - return ! isMonomorphic(); - } - - private void calculateGenotypeCounts() { - if ( genotypeCounts == null ) { - genotypeCounts = new int[Genotype.Type.values().length]; - - for ( Genotype g : getGenotypes().values() ) { - if ( g.isNoCall() ) - genotypeCounts[Genotype.Type.NO_CALL.ordinal()]++; - else if ( g.isHomRef() ) - genotypeCounts[Genotype.Type.HOM_REF.ordinal()]++; - else if ( g.isHet() ) - genotypeCounts[Genotype.Type.HET.ordinal()]++; - else if ( g.isHomVar() ) - genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]++; - else - genotypeCounts[Genotype.Type.MIXED.ordinal()]++; - } - } - } - - /** - * Genotype-specific functions -- how many no-calls are there in the genotypes? - * - * @return number of no calls - */ - public int getNoCallCount() { - calculateGenotypeCounts(); - return genotypeCounts[Genotype.Type.NO_CALL.ordinal()]; - } - - /** - * Genotype-specific functions -- how many hom ref calls are there in the genotypes? - * - * @return number of hom ref calls - */ - public int getHomRefCount() { - calculateGenotypeCounts(); - return genotypeCounts[Genotype.Type.HOM_REF.ordinal()]; - } - - /** - * Genotype-specific functions -- how many het calls are there in the genotypes? - * - * @return number of het calls - */ - public int getHetCount() { - calculateGenotypeCounts(); - return genotypeCounts[Genotype.Type.HET.ordinal()]; - } - - /** - * Genotype-specific functions -- how many hom var calls are there in the genotypes? - * - * @return number of hom var calls - */ - public int getHomVarCount() { - return genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]; - } - - /** - * Genotype-specific functions -- how many mixed calls are there in the genotypes? - * - * @return number of mixed calls - */ - public int getMixedCount() { - return genotypeCounts[Genotype.Type.MIXED.ordinal()]; - } - - // --------------------------------------------------------------------------------------------------------- - // - // validation: extra-strict validation routines for paranoid users - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Run all extra-strict validation tests on a Variant Context object - * - * @param reference the true reference allele - * @param paddedRefBase the reference base used for padding indels - * @param rsIDs the true dbSNP IDs - */ - public void extraStrictValidation(Allele reference, Byte paddedRefBase, Set rsIDs) { - // validate the reference - validateReferenceBases(reference, paddedRefBase); - - // validate the RS IDs - validateRSIDs(rsIDs); - - // validate the altenate alleles - validateAlternateAlleles(); - - // validate the AN and AC fields - validateChromosomeCounts(); - - // TODO: implement me - //checkReferenceTrack(); - } - - public void validateReferenceBases(Allele reference, Byte paddedRefBase) { - // don't validate if we're a complex event - if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) { - throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); - } - - // we also need to validate the padding base for simple indels - if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) { - throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), (char)paddedRefBase.byteValue(), (char)getReferenceBaseForIndel().byteValue())); - } - } - - public void validateRSIDs(Set rsIDs) { - if ( rsIDs != null && hasAttribute(VariantContext.ID_KEY) ) { - for ( String id : getID().split(VCFConstants.ID_FIELD_SEPARATOR) ) { - if ( id.startsWith("rs") && !rsIDs.contains(id) ) - throw new TribbleException.InternalCodecException(String.format("the rsID %s for the record at position %s:%d is not in dbSNP", id, getChr(), getStart())); - } - } - } - - public void validateAlternateAlleles() { - if ( !hasGenotypes() ) - return; - - List reportedAlleles = getAlleles(); - Set observedAlleles = new HashSet(); - observedAlleles.add(getReference()); - for ( Genotype g : getGenotypes().values() ) { - if ( g.isCalled() ) - observedAlleles.addAll(g.getAlleles()); - } - - if ( reportedAlleles.size() != observedAlleles.size() ) - throw new TribbleException.InternalCodecException(String.format("the ALT allele(s) for the record at position %s:%d do not match what is observed in the per-sample genotypes", getChr(), getStart())); - - int originalSize = reportedAlleles.size(); - // take the intersection and see if things change - observedAlleles.retainAll(reportedAlleles); - if ( observedAlleles.size() != originalSize ) - throw new TribbleException.InternalCodecException(String.format("the ALT allele(s) for the record at position %s:%d do not match what is observed in the per-sample genotypes", getChr(), getStart())); - } - - public void validateChromosomeCounts() { - Map observedAttrs = calculateChromosomeCounts(); - - // AN - if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { - int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString()); - int observedAN = (Integer)observedAttrs.get(VCFConstants.ALLELE_NUMBER_KEY); - if ( reportedAN != observedAN ) - throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedAN, observedAN)); - } - - // AC - if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { - List observedACs = (List)observedAttrs.get(VCFConstants.ALLELE_COUNT_KEY); - if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) { - Collections.sort(observedACs); - List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY); - Collections.sort(reportedACs); - if ( observedACs.size() != reportedACs.size() ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have the correct number of values for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedACs.size(), observedACs.size())); - for (int i = 0; i < observedACs.size(); i++) { - if ( Integer.valueOf(reportedACs.get(i).toString()) != observedACs.get(i) ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedACs.get(i), observedACs.get(i))); - } - } else { - if ( observedACs.size() != 1 ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have enough values for the record at position %s:%d", getChr(), getStart())); - int reportedAC = Integer.valueOf(getAttribute(VCFConstants.ALLELE_COUNT_KEY).toString()); - if ( reportedAC != observedACs.get(0) ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedAC, observedACs.get(0))); - } - } - } - - private Map calculateChromosomeCounts() { - Map attributes = new HashMap(); - - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, getChromosomeCount()); - ArrayList alleleFreqs = new ArrayList(); - ArrayList alleleCounts = new ArrayList(); - - // if there are alternate alleles, record the relevant tags - if ( getAlternateAlleles().size() > 0 ) { - for ( Allele allele : getAlternateAlleles() ) { - alleleCounts.add(getChromosomeCount(allele)); - alleleFreqs.add((double)getChromosomeCount(allele) / (double)getChromosomeCount()); - } - } - // otherwise, set them to 0 - else { - alleleCounts.add(0); - alleleFreqs.add(0.0); - } - - attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs); - return attributes; - } - - // --------------------------------------------------------------------------------------------------------- - // - // validation: the normal validation routines are called automatically upon creation of the VC - // - // --------------------------------------------------------------------------------------------------------- - - /** - * To be called by any modifying routines - */ - private boolean validate() { - return validate(true); - } - - private boolean validate(boolean throwException) { - try { - validateReferencePadding(); - validateAlleles(); - validateGenotypes(); - } catch ( IllegalArgumentException e ) { - if ( throwException ) - throw e; - else - return false; - } - - return true; - } - - private void validateReferencePadding() { - if (hasSymbolicAlleles()) // symbolic alleles don't need padding... - return; - - boolean needsPadding = (getReference().length() == getEnd() - getStart()); // off by one because padded base was removed - - if ( needsPadding && !hasReferenceBaseForIndel() ) - throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" + getStart() + "; no padded reference base was provided."); - } - - private void validateAlleles() { - // check alleles - boolean alreadySeenRef = false, alreadySeenNull = false; - for ( Allele allele : alleles ) { - // make sure there's only one reference allele - if ( allele.isReference() ) { - if ( alreadySeenRef ) throw new IllegalArgumentException("BUG: Received two reference tagged alleles in VariantContext " + alleles + " this=" + this); - alreadySeenRef = true; - } - - if ( allele.isNoCall() ) { - throw new IllegalArgumentException("BUG: Cannot add a no call allele to a variant context " + alleles + " this=" + this); - } - - // make sure there's only one null allele - if ( allele.isNull() ) { - if ( alreadySeenNull ) throw new IllegalArgumentException("BUG: Received two null alleles in VariantContext " + alleles + " this=" + this); - alreadySeenNull = true; - } - } - - // make sure there's one reference allele - if ( ! alreadySeenRef ) - throw new IllegalArgumentException("No reference allele found in VariantContext"); - -// if ( getType() == Type.INDEL ) { -// if ( getReference().length() != (getLocation().size()-1) ) { - long length = (stop - start) + 1; - if ( (getReference().isNull() && length != 1 ) || - (getReference().isNonNull() && (length - getReference().length() > 1))) { - throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this); - } - } - - private void validateGenotypes() { - if ( this.genotypes == null ) throw new IllegalStateException("Genotypes is null"); - - for ( Map.Entry elt : this.genotypes.entrySet() ) { - String name = elt.getKey(); - Genotype g = elt.getValue(); - - if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName()); - - if ( g.isAvailable() ) { - for ( Allele gAllele : g.getAlleles() ) { - if ( ! hasAllele(gAllele) && gAllele.isCalled() ) - throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); - } - } - } - } - - // --------------------------------------------------------------------------------------------------------- - // - // utility routines - // - // --------------------------------------------------------------------------------------------------------- - - private void determineType() { - if ( type == null ) { - switch ( getNAlleles() ) { - case 0: - throw new IllegalStateException("Unexpected error: requested type of VariantContext with no alleles!" + this); - case 1: - // note that this doesn't require a reference allele. You can be monomorphic independent of having a - // reference allele - type = Type.NO_VARIATION; - break; - default: - determinePolymorphicType(); - } - } - } - - private void determinePolymorphicType() { - type = null; - - // do a pairwise comparison of all alleles against the reference allele - for ( Allele allele : alleles ) { - if ( allele == REF ) - continue; - - // find the type of this allele relative to the reference - Type biallelicType = typeOfBiallelicVariant(REF, allele); - - // for the first alternate allele, set the type to be that one - if ( type == null ) { - type = biallelicType; - } - // if the type of this allele is different from that of a previous one, assign it the MIXED type and quit - else if ( biallelicType != type ) { - type = Type.MIXED; - return; - } - } - } - - private static Type typeOfBiallelicVariant(Allele ref, Allele allele) { - if ( ref.isSymbolic() ) - throw new IllegalStateException("Unexpected error: encountered a record with a symbolic reference allele"); - - if ( allele.isSymbolic() ) - return Type.SYMBOLIC; - - if ( ref.length() == allele.length() ) { - if ( allele.length() == 1 ) - return Type.SNP; - else - return Type.MNP; - } - - // Important note: previously we were checking that one allele is the prefix of the other. However, that's not an - // appropriate check as can be seen from the following example: - // REF = CTTA and ALT = C,CT,CA - // This should be assigned the INDEL type but was being marked as a MIXED type because of the prefix check. - // In truth, it should be absolutely impossible to return a MIXED type from this method because it simply - // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type - // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point - // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. - return Type.INDEL; - - // old incorrect logic: - // if (oneIsPrefixOfOther(ref, allele)) - // return Type.INDEL; - // else - // return Type.MIXED; - } - - public String toString() { - return String.format("[VC %s @ %s of type=%s alleles=%s attr=%s GT=%s", - getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop), this.getType(), - ParsingUtils.sortList(this.getAlleles()), ParsingUtils.sortedString(this.getAttributes()), this.getGenotypesSortedByName()); - } - - // protected basic manipulation routines - private static List makeAlleles(Collection alleles) { - final List alleleList = new ArrayList(alleles.size()); - - boolean sawRef = false; - for ( final Allele a : alleles ) { - for ( final Allele b : alleleList ) { - if ( a.equals(b, true) ) - throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a); - } - - // deal with the case where the first allele isn't the reference - if ( a.isReference() ) { - if ( sawRef ) - throw new IllegalArgumentException("Alleles for a VariantContext must contain at most one reference allele: " + alleles); - alleleList.add(0, a); - sawRef = true; - } - else - alleleList.add(a); - } - - if ( alleleList.isEmpty() ) - throw new IllegalArgumentException("Cannot create a VariantContext with an empty allele list"); - - if ( alleleList.get(0).isNonReference() ) - throw new IllegalArgumentException("Alleles for a VariantContext must contain at least one reference allele: " + alleles); - - return alleleList; - } - - public static Map genotypeCollectionToMap(Map dest, Collection genotypes) { - for ( Genotype g : genotypes ) { - if ( dest.containsKey(g.getSampleName() ) ) - throw new IllegalArgumentException("Duplicate genotype added to VariantContext: " + g); - dest.put(g.getSampleName(), g); - } - - return dest; - } - - // --------------------------------------------------------------------------------------------------------- - // - // tribble integration routines -- not for public consumption - // - // --------------------------------------------------------------------------------------------------------- - public String getChr() { - return contig; - } - - public int getStart() { - return (int)start; - } - - public int getEnd() { - return (int)stop; - } - - private boolean hasSymbolicAlleles() { - for (Allele a: getAlleles()) { - if (a.isSymbolic()) { - return true; - } - } - return false; - } - - public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { - - // see if we need to pad common reference base from all alleles - boolean padVC; - - // We need to pad a VC with a common base if the length of the reference allele is less than the length of the VariantContext. - // This happens because the position of e.g. an indel is always one before the actual event (as per VCF convention). - long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; - if (inputVC.hasSymbolicAlleles()) - padVC = true; - else if (inputVC.getReference().length() == locLength) - padVC = false; - else if (inputVC.getReference().length() == locLength-1) - padVC = true; - else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + - " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size"); - - // nothing to do if we don't need to pad bases - if (padVC) { - - if ( !inputVC.hasReferenceBaseForIndel() ) - throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); - - Byte refByte = inputVC.getReferenceBaseForIndel(); - - List alleles = new ArrayList(); - Map genotypes = new TreeMap(); - - Map inputGenotypes = inputVC.getGenotypes(); - - for (Allele a : inputVC.getAlleles()) { - // get bases for current allele and create a new one with trimmed bases - if (a.isSymbolic()) { - alleles.add(a); - } else { - String newBases; - if ( refBaseShouldBeAppliedToEndOfAlleles ) - newBases = a.getBaseString() + new String(new byte[]{refByte}); - else - newBases = new String(new byte[]{refByte}) + a.getBaseString(); - alleles.add(Allele.create(newBases,a.isReference())); - } - } - - // now we can recreate new genotypes with trimmed alleles - for (String sample : inputVC.getSampleNames()) { - Genotype g = inputGenotypes.get(sample); - - List inAlleles = g.getAlleles(); - List newGenotypeAlleles = new ArrayList(); - for (Allele a : inAlleles) { - if (a.isCalled()) { - if (a.isSymbolic()) { - newGenotypeAlleles.add(a); - } else { - String newBases; - if ( refBaseShouldBeAppliedToEndOfAlleles ) - newBases = a.getBaseString() + new String(new byte[]{refByte}); - else - newBases = new String(new byte[]{refByte}) + a.getBaseString(); - newGenotypeAlleles.add(Allele.create(newBases,a.isReference())); - } - } - else { - // add no-call allele - newGenotypeAlleles.add(Allele.NO_CALL); - } - } - genotypes.put(sample, new Genotype(sample, newGenotypeAlleles, g.getNegLog10PError(), - g.getFilters(),g.getAttributes(),g.isPhased())); - - } - - // Do not change the filter state if filters were not applied to this context - Set inputVCFilters = inputVC.filtersWereAppliedToContext ? inputVC.getFilters() : null; - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes(),refByte); - } - else - return inputVC; - - } - - public ArrayList getTwoAllelesWithHighestAlleleCounts() { - // first idea: get two alleles with highest AC - int maxAC1 = 0, maxAC2=0,maxAC1ind =0, maxAC2ind = 0; - int i=0; - int[] alleleCounts = new int[this.getAlleles().size()]; - ArrayList alleleArray = new ArrayList(); - for (Allele a:this.getAlleles()) { - int ac = this.getChromosomeCount(a); - if (ac >=maxAC1) { - maxAC1 = ac; - maxAC1ind = i; - } - alleleArray.add(a); - alleleCounts[i++] = ac; - } - // now get second best allele - for (i=0; i < alleleCounts.length; i++) { - if (i == maxAC1ind) - continue; - if (alleleCounts[i] >= maxAC2) { - maxAC2 = alleleCounts[i]; - maxAC2ind = i; - } - } - - Allele alleleA, alleleB; - if (alleleArray.get(maxAC1ind).isReference()) { - alleleA = alleleArray.get(maxAC1ind); - alleleB = alleleArray.get(maxAC2ind); - } - else if (alleleArray.get(maxAC2ind).isReference()) { - alleleA = alleleArray.get(maxAC2ind); - alleleB = alleleArray.get(maxAC1ind); - } else { - alleleA = alleleArray.get(maxAC1ind); - alleleB = alleleArray.get(maxAC2ind); - } - ArrayList a = new ArrayList(); - a.add(alleleA); - a.add(alleleB); - return a; - } - public Allele getAltAlleleWithHighestAlleleCount() { - // first idea: get two alleles with highest AC - Allele best = null; - int maxAC1 = 0; - for (Allele a:this.getAlternateAlleles()) { - int ac = this.getChromosomeCount(a); - if (ac >=maxAC1) { - maxAC1 = ac; - best = a; - } - - } - return best; - } - - public int[] getGLIndecesOfAllele(Allele inputAllele) { - int[] idxVector = new int[3]; - int numAlleles = this.getAlleles().size(); - - int idxDiag = numAlleles; - int incr = numAlleles - 1; - int k=1; - for (Allele a: getAlternateAlleles()) { - // multi-allelic approximation, part 1: Ideally - // for each alt allele compute marginal (suboptimal) posteriors - - // compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of - // as a row-wise upper triangular matrix of likelihoods. - // So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC. - // 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD - - int idxAA = 0; - int idxAB = k++; - // yy is always element on the diagonal. - // 2 alleles: BBelement 2 - // 3 alleles: BB element 3. CC element 5 - // 4 alleles: - int idxBB = idxDiag; - - if (a.equals(inputAllele)) { - idxVector[0] = idxAA; - idxVector[1] = idxAB; - idxVector[2] = idxBB; - break; - } - idxDiag += incr--; - } - return idxVector; - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContextUtils.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContextUtils.java deleted file mode 100755 index e2cf2ecf0..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContextUtils.java +++ /dev/null @@ -1,1407 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.util.StringUtil; -import org.apache.commons.jexl2.Expression; -import org.apache.commons.jexl2.JexlEngine; -import org.apache.log4j.Logger; -import org.broad.tribble.util.popgen.HardyWeinbergCalculation; -import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.Serializable; -import java.util.*; - -public class VariantContextUtils { - private static Logger logger = Logger.getLogger(VariantContextUtils.class); - public final static String MERGE_INTERSECTION = "Intersection"; - public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; - public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; - public final static String MERGE_FILTER_PREFIX = "filterIn"; - - final public static JexlEngine engine = new JexlEngine(); - static { - engine.setSilent(false); // will throw errors now for selects that don't evaluate properly - engine.setLenient(false); - } - - /** - * Create a new VariantContext - * - * @param name name - * @param loc location - * @param alleles alleles - * @param genotypes genotypes set - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @return VariantContext object - */ - public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes != null ? VariantContext.genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * @param name name - * @param loc location - * @param alleles alleles - * @return VariantContext object - */ - public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles) { - return new VariantContext (name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, VariantContext.NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * @param name name - * @param loc location - * @param alleles alleles - * @param genotypes genotypes - * @return VariantContext object - */ - public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles, Collection genotypes) { - return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - /** - * Copy constructor - * - * @param other the VariantContext to copy - * @return VariantContext object - */ - public static VariantContext toVC(VariantContext other) { - return new VariantContext(other.getSource(), other.getChr(), other.getStart(), other.getEnd(), other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.getFilters(), other.getAttributes()); - } - - /** - * Update the attributes of the attributes map given the VariantContext to reflect the proper chromosome-based VCF tags - * - * @param vc the VariantContext - * @param attributes the attributes map to populate; must not be null; may contain old values - * @param removeStaleValues should we remove stale values from the mapping? - */ - public static void calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) { - // if everyone is a no-call, remove the old attributes if requested - if ( vc.getChromosomeCount() == 0 && removeStaleValues ) { - if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) - attributes.remove(VCFConstants.ALLELE_COUNT_KEY); - if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) ) - attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); - if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) ) - attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); - return; - } - - if ( vc.hasGenotypes() ) { - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, vc.getChromosomeCount()); - - // if there are alternate alleles, record the relevant tags - if ( vc.getAlternateAlleles().size() > 0 ) { - ArrayList alleleFreqs = new ArrayList(); - ArrayList alleleCounts = new ArrayList(); - double totalChromosomes = (double)vc.getChromosomeCount(); - for ( Allele allele : vc.getAlternateAlleles() ) { - int altChromosomes = vc.getChromosomeCount(allele); - alleleCounts.add(altChromosomes); - String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double)altChromosomes / totalChromosomes)); - alleleFreqs.add(freq); - } - - attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); - } - else { - attributes.put(VCFConstants.ALLELE_COUNT_KEY, 0); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0); - } - } - } - - private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { - int precision = 1; - - while ( maxValue > 1 ) { - precision++; - maxValue /= 10.0; - } - - return "%." + precision + "f"; - } - - public static Genotype removePLs(Genotype g) { - Map attrs = new HashMap(g.getAttributes()); - attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); - attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); - return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased()); - } - - /** - * A simple but common wrapper for matching VariantContext objects using JEXL expressions - */ - public static class JexlVCMatchExp { - public String name; - public Expression exp; - - /** - * Create a new matcher expression with name and JEXL expression exp - * @param name name - * @param exp expression - */ - public JexlVCMatchExp(String name, Expression exp) { - this.name = name; - this.exp = exp; - } - } - - /** - * Method for creating JexlVCMatchExp from input walker arguments names and exps. These two arrays contain - * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return - * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to - * match() below. - * - * @param names names - * @param exps expressions - * @return list of matches - */ - public static List initializeMatchExps(String[] names, String[] exps) { - if ( names == null || exps == null ) - throw new ReviewedStingException("BUG: neither names nor exps can be null: names " + Arrays.toString(names) + " exps=" + Arrays.toString(exps) ); - - if ( names.length != exps.length ) - throw new UserException("Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names) + " exps=" + Arrays.toString(exps)); - - Map map = new HashMap(); - for ( int i = 0; i < names.length; i++ ) { map.put(names[i], exps[i]); } - - return VariantContextUtils.initializeMatchExps(map); - } - - public static List initializeMatchExps(ArrayList names, ArrayList exps) { - String[] nameArray = new String[names.size()]; - String[] expArray = new String[exps.size()]; - return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray)); - } - - - /** - * Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps. These two arrays contain - * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return - * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to - * match() below. - * - * @param names_and_exps mapping of names to expressions - * @return list of matches - */ - public static List initializeMatchExps(Map names_and_exps) { - List exps = new ArrayList(); - - for ( Map.Entry elt : names_and_exps.entrySet() ) { - String name = elt.getKey(); - String expStr = elt.getValue(); - - if ( name == null || expStr == null ) throw new IllegalArgumentException("Cannot create null expressions : " + name + " " + expStr); - try { - Expression exp = engine.createExpression(expStr); - exps.add(new JexlVCMatchExp(name, exp)); - } catch (Exception e) { - throw new UserException.BadArgumentValue(name, "Invalid expression used (" + expStr + "). Please see the JEXL docs for correct syntax.") ; - } - } - - return exps; - } - - /** - * Returns true if exp match VC. See collection<> version for full docs. - * @param vc variant context - * @param exp expression - * @return true if there is a match - */ - public static boolean match(VariantContext vc, JexlVCMatchExp exp) { - return match(vc,Arrays.asList(exp)).get(exp); - } - - /** - * Matches each JexlVCMatchExp exp against the data contained in vc, and returns a map from these - * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL - * expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp - * expressions. - * - * @param vc variant context - * @param exps expressions - * @return true if there is a match - */ - public static Map match(VariantContext vc, Collection exps) { - return new JEXLMap(exps,vc); - - } - - /** - * Returns true if exp match VC/g. See collection<> version for full docs. - * @param vc variant context - * @param g genotype - * @param exp expression - * @return true if there is a match - */ - public static boolean match(VariantContext vc, Genotype g, JexlVCMatchExp exp) { - return match(vc,g,Arrays.asList(exp)).get(exp); - } - - /** - * Matches each JexlVCMatchExp exp against the data contained in vc/g, and returns a map from these - * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL - * expressions to VariantContext records/genotypes. Use initializeMatchExps() to create the list of JexlVCMatchExp - * expressions. - * - * @param vc variant context - * @param g genotype - * @param exps expressions - * @return true if there is a match - */ - public static Map match(VariantContext vc, Genotype g, Collection exps) { - return new JEXLMap(exps,vc,g); - } - - public static double computeHardyWeinbergPvalue(VariantContext vc) { - if ( vc.getChromosomeCount() == 0 ) - return 0.0; - return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); - } - - /** - * Returns a newly allocated VC that is the same as VC, but without genotypes - * @param vc variant context - * @return new VC without genotypes - */ - @Requires("vc != null") - @Ensures("result != null") - public static VariantContext sitesOnlyVariantContext(VariantContext vc) { - return VariantContext.modifyGenotypes(vc, null); - } - - /** - * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes - * @param vcs collection of VCs - * @return new VCs without genotypes - */ - @Requires("vcs != null") - @Ensures("result != null") - public static Collection sitesOnlyVariantContexts(Collection vcs) { - List r = new ArrayList(); - for ( VariantContext vc : vcs ) - r.add(sitesOnlyVariantContext(vc)); - return r; - } - - public static VariantContext pruneVariantContext(VariantContext vc) { - return pruneVariantContext(vc, null); - } - - public static VariantContext pruneVariantContext(final VariantContext vc, final Collection keysToPreserve ) { - final MutableVariantContext mvc = new MutableVariantContext(vc); - - if ( keysToPreserve == null || keysToPreserve.size() == 0 ) - mvc.clearAttributes(); - else { - final Map d = mvc.getAttributes(); - mvc.clearAttributes(); - for ( String key : keysToPreserve ) - if ( d.containsKey(key) ) - mvc.putAttribute(key, d.get(key)); - } - - // this must be done as the ID is stored in the attributes field - if ( vc.hasID() ) mvc.setID(vc.getID()); - - Collection gs = mvc.getGenotypes().values(); - mvc.clearGenotypes(); - for ( Genotype g : gs ) { - MutableGenotype mg = new MutableGenotype(g); - mg.clearAttributes(); - if ( keysToPreserve != null ) - for ( String key : keysToPreserve ) - if ( g.hasAttribute(key) ) - mg.putAttribute(key, g.getAttribute(key)); - mvc.addGenotype(mg); - } - - return mvc; - } - - public enum GenotypeMergeType { - /** - * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. - */ - UNIQUIFY, - /** - * Take genotypes in priority order (see the priority argument). - */ - PRIORITIZE, - /** - * Take the genotypes in any order. - */ - UNSORTED, - /** - * Require that all samples/genotypes be unique between all inputs. - */ - REQUIRE_UNIQUE - } - - public enum FilteredRecordMergeType { - /** - * Union - leaves the record if any record is unfiltered. - */ - KEEP_IF_ANY_UNFILTERED, - /** - * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. - */ - KEEP_IF_ALL_UNFILTERED - } - - /** - * Performs a master merge on the VCs. Here there is a master input [contains all of the information] and many - * VCs containing partial, extra genotype information which should be added to the master. For example, - * we scatter out the phasing algorithm over some samples in the master, producing a minimal VCF with phasing - * information per genotype. The master merge will add the PQ information from each genotype record, where - * appropriate, to the master VC. - * - * @param unsortedVCs collection of VCs - * @param masterName name of master VC - * @return master-merged VC - */ - public static VariantContext masterMerge(Collection unsortedVCs, String masterName) { - VariantContext master = findMaster(unsortedVCs, masterName); - Map genotypes = master.getGenotypes(); - for (Genotype g : genotypes.values()) { - genotypes.put(g.getSampleName(), new MutableGenotype(g)); - } - - Map masterAttributes = new HashMap(master.getAttributes()); - - for (VariantContext vc : unsortedVCs) { - if (!vc.getSource().equals(masterName)) { - for (Genotype g : vc.getGenotypes().values()) { - MutableGenotype masterG = (MutableGenotype) genotypes.get(g.getSampleName()); - for (Map.Entry attr : g.getAttributes().entrySet()) { - if (!masterG.hasAttribute(attr.getKey())) { - //System.out.printf("Adding GT attribute %s to masterG %s, new %s%n", attr, masterG, g); - masterG.putAttribute(attr.getKey(), attr.getValue()); - } - } - - if (masterG.isPhased() != g.isPhased()) { - if (masterG.sameGenotype(g)) { - // System.out.printf("Updating phasing %s to masterG %s, new %s%n", g.isPhased(), masterG, g); - masterG.setAlleles(g.getAlleles()); - masterG.setPhase(g.isPhased()); - } - //else System.out.println("WARNING: Not updating phase, since genotypes differ between master file and auxiliary info file!"); - } - -// if ( MathUtils.compareDoubles(masterG.getNegLog10PError(), g.getNegLog10PError()) != 0 ) { -// System.out.printf("Updating GQ %s to masterG %s, new %s%n", g.getNegLog10PError(), masterG, g); -// masterG.setNegLog10PError(g.getNegLog10PError()); -// } - - } - - for (Map.Entry attr : vc.getAttributes().entrySet()) { - if (!masterAttributes.containsKey(attr.getKey())) { - //System.out.printf("Adding VC attribute %s to master %s, new %s%n", attr, master, vc); - masterAttributes.put(attr.getKey(), attr.getValue()); - } - } - } - } - - return new VariantContext(master.getSource(), master.getChr(), master.getStart(), master.getEnd(), master.getAlleles(), genotypes, master.getNegLog10PError(), master.getFilters(), masterAttributes); - } - - private static VariantContext findMaster(Collection unsortedVCs, String masterName) { - for (VariantContext vc : unsortedVCs) { - if (vc.getSource().equals(masterName)) { - return vc; - } - } - - throw new ReviewedStingException(String.format("Couldn't find master VCF %s at %s", masterName, unsortedVCs.iterator().next())); - } - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. - * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name - * - * @param genomeLocParser loc parser - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, - final Collection unsortedVCs, - final List priorityListOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { - if ( unsortedVCs == null || unsortedVCs.size() == 0 ) - return null; - - if ( annotateOrigin && priorityListOfVCs == null ) - throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); - - if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) - verifyUniqueSampleNames(unsortedVCs); - - List prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); - // Make sure all variant contexts are padded with reference base in case of indels if necessary - List VCs = new ArrayList(); - - for (VariantContext vc : prepaddedVCs) { - // also a reasonable place to remove filtered calls, if needed - if ( ! filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc, false)); - } - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled - return null; - - // establish the baseline info from the first VC - final VariantContext first = VCs.get(0); - final String name = first.getSource(); - final Allele refAllele = determineReferenceAllele(VCs); - - final Set alleles = new LinkedHashSet(); - final Set filters = new TreeSet(); - final Map attributes = new TreeMap(); - final Set inconsistentAttributes = new HashSet(); - final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant - final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id - - GenomeLoc loc = getLocation(genomeLocParser,first); - int depth = 0; - int maxAC = -1; - final Map attributesWithMaxAC = new TreeMap(); - double negLog10PError = -1; - VariantContext vcWithMaxAC = null; - Map genotypes = new TreeMap(); - - // counting the number of filtered and variant VCs - int nFiltered = 0; - - boolean remapped = false; - - // cycle through and add info from the other VCs, making sure the loc/reference matches - - for ( VariantContext vc : VCs ) { - if ( loc.getStart() != vc.getStart() ) // || !first.getReference().equals(vc.getReference()) ) - throw new ReviewedStingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); - - if ( getLocation(genomeLocParser,vc).size() > loc.size() ) - loc = getLocation(genomeLocParser,vc); // get the longest location - - nFiltered += vc.isFiltered() ? 1 : 0; - if ( vc.isVariant() ) variantSources.add(vc.getSource()); - - AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); - remapped = remapped || alleleMapping.needsRemapping(); - - alleles.addAll(alleleMapping.values()); - - mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); - - negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); - - filters.addAll(vc.getFilters()); - - // - // add attributes - // - // special case DP (add it up) and ID (just preserve it) - // - if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - if ( vc.hasID() && ! vc.getID().equals(VCFConstants.EMPTY_ID_FIELD) ) rsIDs.add(vc.getID()); - if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); - // lets see if the string contains a , separator - if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { - List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); - for (String alleleCount : alleleCountArray) { - final int ac = Integer.valueOf(alleleCount.trim()); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } else { - final int ac = Integer.valueOf(rawAlleleCounts); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } - - for (Map.Entry p : vc.getAttributes().entrySet()) { - String key = p.getKey(); - // if we don't like the key already, don't go anywhere - if ( ! inconsistentAttributes.contains(key) ) { - boolean alreadyFound = attributes.containsKey(key); - Object boundValue = attributes.get(key); - boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - - if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); - attributes.put(key, p.getValue()); - } - } - } - } - - // if we have more alternate alleles in the merged VC than in one or more of the - // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF - for ( VariantContext vc : VCs ) { - if (vc.alleles.size() == 1) - continue; - if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { - logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", - genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); - genotypes = stripPLs(genotypes); - // this will remove stale AC,AF attributed from vc - calculateChromosomeCounts(vc, attributes, true); - break; - } - } - - // take the VC with the maxAC and pull the attributes into a modifiable map - if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); - } - - // if at least one record was unfiltered and we want a union, clear all of the filters - if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) - filters.clear(); - - - if ( annotateOrigin ) { // we care about where the call came from - String setValue; - if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered - setValue = MERGE_INTERSECTION; - else if ( nFiltered == VCs.size() ) // everything was filtered out - setValue = MERGE_FILTER_IN_ALL; - else if ( variantSources.isEmpty() ) // everyone was reference - setValue = MERGE_REF_IN_ALL; - else { - LinkedHashSet s = new LinkedHashSet(); - for ( VariantContext vc : VCs ) - if ( vc.isVariant() ) - s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); - setValue = Utils.join("-", s); - } - - if ( setKey != null ) { - attributes.put(setKey, setValue); - if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource()); } - } - } - - if ( depth > 0 ) - attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - - if ( ! rsIDs.isEmpty() ) { - attributes.put(VariantContext.ID_KEY, Utils.join(",", rsIDs)); - } - - VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes) ); - // Trim the padded bases of all alleles if necessary - merged = createVariantContextWithTrimmedAlleles(merged); - - if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); - return merged; - } - - private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { - final Iterator it1 = alleleSet1.iterator(); - final Iterator it2 = alleleSet2.iterator(); - - while ( it1.hasNext() && it2.hasNext() ) { - final Allele a1 = it1.next(); - final Allele a2 = it2.next(); - if ( ! a1.equals(a2) ) - return true; - } - - // by this point, at least one of the iterators is empty. All of the elements - // we've compared are equal up until this point. But it's possible that the - // sets aren't the same size, which is indicated by the test below. If they - // are of the same size, though, the sets are compatible - return it1.hasNext() || it2.hasNext(); - } - - public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { - // if all alleles of vc1 are a contained in alleles of vc2, return true - if (!vc1.getReference().equals(vc2.getReference())) - return false; - - for (Allele a :vc1.getAlternateAlleles()) { - if (!vc2.getAlternateAlleles().contains(a)) - return false; - } - - return true; - } - public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { - // see if we need to trim common reference base from all alleles - boolean trimVC; - - // We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles - Allele refAllele = inputVC.getReference(); - if (!inputVC.isVariant()) - trimVC = false; - else if (refAllele.isNull()) - trimVC = false; - else { - trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), - inputVC.getReference().getDisplayString()) > 0); - } - - // nothing to do if we don't need to trim bases - if (trimVC) { - List alleles = new ArrayList(); - Map genotypes = new TreeMap(); - - // set the reference base for indels in the attributes - Map attributes = new TreeMap(inputVC.getAttributes()); - - Map originalToTrimmedAlleleMap = new HashMap(); - - for (Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length()); - Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation - // example: mixed records such as {TA*,TGA,TG} - boolean hasNullAlleles = false; - - for (Allele a: originalToTrimmedAlleleMap.values()) { - if (a.isNull()) - hasNullAlleles = true; - if (a.isReference()) - refAllele = a; - } - - if (!hasNullAlleles) - return inputVC; - // now we can recreate new genotypes with trimmed alleles - for ( Map.Entry sample : inputVC.getGenotypes().entrySet() ) { - - List originalAlleles = sample.getValue().getAlleles(); - List trimmedAlleles = new ArrayList(); - for ( Allele a : originalAlleles ) { - if ( a.isCalled() ) - trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); - else - trimmedAlleles.add(Allele.NO_CALL); - } - genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); - - } - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); - - } - - return inputVC; - } - - public static Map stripPLs(Map genotypes) { - Map newGs = new HashMap(genotypes.size()); - - for ( Map.Entry g : genotypes.entrySet() ) { - newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? removePLs(g.getValue()) : g.getValue()); - } - - return newGs; - } - - public static Map> separateVariantContextsByType(Collection VCs) { - HashMap> mappedVCs = new HashMap>(); - for ( VariantContext vc : VCs ) { - - // look at previous variant contexts of different type. If: - // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list - // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) - // c) neither: do nothing, just add vc to its own list - boolean addtoOwnList = true; - for (VariantContext.Type type : VariantContext.Type.values()) { - if (type.equals(vc.getType())) - continue; - - if (!mappedVCs.containsKey(type)) - continue; - - List vcList = mappedVCs.get(type); - for (int k=0; k < vcList.size(); k++) { - VariantContext otherVC = vcList.get(k); - if (allelesAreSubset(otherVC,vc)) { - // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list - vcList.remove(k); - // avoid having empty lists - if (vcList.size() == 0) - mappedVCs.remove(vcList); - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(otherVC); - break; - } - else if (allelesAreSubset(vc,otherVC)) { - // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own - mappedVCs.get(type).add(vc); - addtoOwnList = false; - break; - } - } - } - if (addtoOwnList) { - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(vc); - } - } - - return mappedVCs; - } - - private static class AlleleMapper { - private VariantContext vc = null; - private Map map = null; - public AlleleMapper(VariantContext vc) { this.vc = vc; } - public AlleleMapper(Map map) { this.map = map; } - public boolean needsRemapping() { return this.map != null; } - public Collection values() { return map != null ? map.values() : vc.getAlleles(); } - - public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } - - public List remap(List as) { - List newAs = new ArrayList(); - for ( Allele a : as ) { - //System.out.printf(" Remapping %s => %s%n", a, remap(a)); - newAs.add(remap(a)); - } - return newAs; - } - } - - static private void verifyUniqueSampleNames(Collection unsortedVCs) { - Set names = new HashSet(); - for ( VariantContext vc : unsortedVCs ) { - for ( String name : vc.getSampleNames() ) { - //System.out.printf("Checking %s %b%n", name, names.contains(name)); - if ( names.contains(name) ) - throw new UserException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name); - } - - names.addAll(vc.getSampleNames()); - } - } - - - static private Allele determineReferenceAllele(List VCs) { - Allele ref = null; - - for ( VariantContext vc : VCs ) { - Allele myRef = vc.getReference(); - if ( ref == null || ref.length() < myRef.length() ) - ref = myRef; - else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) - throw new UserException.BadInput(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); - } - - return ref; - } - - static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { - if ( refAllele.equals(vc.getReference()) ) - return new AlleleMapper(vc); - else { - // we really need to do some work. The refAllele is the longest reference allele seen at this - // start site. So imagine it is: - // - // refAllele: ACGTGA - // myRef: ACGT - // myAlt: - - // - // We need to remap all of the alleles in vc to include the extra GA so that - // myRef => refAllele and myAlt => GA - // - - Allele myRef = vc.getReference(); - if ( refAllele.length() <= myRef.length() ) throw new ReviewedStingException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); - byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); - -// System.out.printf("Remapping allele at %s%n", vc); -// System.out.printf("ref %s%n", refAllele); -// System.out.printf("myref %s%n", myRef ); -// System.out.printf("extrabases %s%n", new String(extraBases)); - - Map map = new HashMap(); - for ( Allele a : vc.getAlleles() ) { - if ( a.isReference() ) - map.put(a, refAllele); - else { - Allele extended = Allele.extend(a, extraBases); - for ( Allele b : allAlleles ) - if ( extended.equals(b) ) - extended = b; -// System.out.printf(" Extending %s => %s%n", a, extended); - map.put(a, extended); - } - } - - // debugging -// System.out.printf("mapping %s%n", map); - - return new AlleleMapper(map); - } - } - - static class CompareByPriority implements Comparator, Serializable { - List priorityListOfVCs; - public CompareByPriority(List priorityListOfVCs) { - this.priorityListOfVCs = priorityListOfVCs; - } - - private int getIndex(VariantContext vc) { - int i = priorityListOfVCs.indexOf(vc.getSource()); - if ( i == -1 ) throw new UserException.BadArgumentValue(Utils.join(",", priorityListOfVCs), "Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); - return i; - } - - public int compare(VariantContext vc1, VariantContext vc2) { - return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); - } - } - - public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { - if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) - throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); - - if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) - return new ArrayList(unsortedVCs); - else { - ArrayList sorted = new ArrayList(unsortedVCs); - Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); - return sorted; - } - } - - private static void mergeGenotypes(Map mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { - for ( Genotype g : oneVC.getGenotypes().values() ) { - String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); - if ( ! mergedGenotypes.containsKey(name) ) { - // only add if the name is new - Genotype newG = g; - - if ( uniqifySamples || alleleMapping.needsRemapping() ) { - MutableGenotype mutG = new MutableGenotype(name, g); - if ( alleleMapping.needsRemapping() ) mutG.setAlleles(alleleMapping.remap(g.getAlleles())); - newG = mutG; - } - - mergedGenotypes.put(name, newG); - } - } - } - - public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { - return uniqify ? sampleName + "." + trackName : sampleName; - } - - /** - * Returns a context identical to this with the REF and ALT alleles reverse complemented. - * - * @param vc variant context - * @return new vc - */ - public static VariantContext reverseComplement(VariantContext vc) { - // create a mapping from original allele to reverse complemented allele - HashMap alleleMap = new HashMap(vc.getAlleles().size()); - for ( Allele originalAllele : vc.getAlleles() ) { - Allele newAllele; - if ( originalAllele.isNoCall() || originalAllele.isNull() ) - newAllele = originalAllele; - else - newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); - alleleMap.put(originalAllele, newAllele); - } - - // create new Genotype objects - Map newGenotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { - List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getValue().getAlleles() ) { - Allele newAllele = alleleMap.get(allele); - if ( newAllele == null ) - newAllele = Allele.NO_CALL; - newAlleles.add(newAllele); - } - newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); - } - - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes()); - - } - - public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { - if ( allowedAttributes == null ) - return vc; - - Map newGenotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { - Map attrs = new HashMap(); - for ( Map.Entry attr : genotype.getValue().getAttributes().entrySet() ) { - if ( allowedAttributes.contains(attr.getKey()) ) - attrs.put(attr.getKey(), attr.getValue()); - } - newGenotypes.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attrs)); - } - - return VariantContext.modifyGenotypes(vc, newGenotypes); - } - - public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { - if (!context.isSNP() || !context.isBiallelic()) - throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); - return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); - } - - /** - * If this is a BiAlleic SNP, is it a transition? - */ - public static boolean isTransition(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - /** - * If this is a BiAlleic SNP, is it a transversion? - */ - public static boolean isTransversion(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - - /** - * create a genome location, given a variant context - * @param genomeLocParser parser - * @param vc the variant context - * @return the genomeLoc - */ - public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { - return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); - } - - public abstract static class AlleleMergeRule { - // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): - abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); - - public String toString() { - return "all samples are mergeable"; - } - } - - // NOTE: returns null if vc1 and vc2 are not merged into a single MNP record - - public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { - if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) - return null; - - // Check that it's logically possible to merge the VCs: - if (!allSamplesAreMergeable(vc1, vc2)) - return null; - - // Check if there's a "point" in merging the VCs (e.g., annotations could be changed) - if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2)) - return null; - - return reallyMergeIntoMNP(vc1, vc2, referenceFile); - } - - private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { - int startInter = vc1.getEnd() + 1; - int endInter = vc2.getStart() - 1; - byte[] intermediateBases = null; - if (startInter <= endInter) { - intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); - StringUtil.toUpperCase(intermediateBases); - } - MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added - - Map mergedGenotypes = new HashMap(); - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - List mergedAllelesForSample = new LinkedList(); - - /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records, - we preserve phase information (if any) relative to whatever precedes vc1: - */ - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() - - Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); - mergedAllelesForSample.add(mergedAllele); - } - - double mergedGQ = Math.max(gt1.getNegLog10PError(), gt2.getNegLog10PError()); - Set mergedGtFilters = new HashSet(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered - - Map mergedGtAttribs = new HashMap(); - PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); - if (phaseQual.PQ != null) - mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); - - Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); - mergedGenotypes.put(sample, mergedGt); - } - - String mergedName = VariantContextUtils.mergeVariantContextNames(vc1.getSource(), vc2.getSource()); - double mergedNegLog10PError = Math.max(vc1.getNegLog10PError(), vc2.getNegLog10PError()); - Set mergedFilters = new HashSet(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered - Map mergedAttribs = VariantContextUtils.mergeVariantContextAttributes(vc1, vc2); - - VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs); - - mergedAttribs = new HashMap(mergedVc.getAttributes()); - VariantContextUtils.calculateChromosomeCounts(mergedVc, mergedAttribs, true); - mergedVc = VariantContext.modifyAttributes(mergedVc, mergedAttribs); - - return mergedVc; - } - - private static class AlleleOneAndTwo { - private Allele all1; - private Allele all2; - - public AlleleOneAndTwo(Allele all1, Allele all2) { - this.all1 = all1; - this.all2 = all2; - } - - public int hashCode() { - return all1.hashCode() + all2.hashCode(); - } - - public boolean equals(Object other) { - if (!(other instanceof AlleleOneAndTwo)) - return false; - - AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other; - return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); - } - } - - private static class MergedAllelesData { - private Map mergedAlleles; - private byte[] intermediateBases; - private int intermediateLength; - - public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { - this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo - this.intermediateBases = intermediateBases; - this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0; - - this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); - } - - public Allele ensureMergedAllele(Allele all1, Allele all2) { - return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor - } - - private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { - AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); - Allele mergedAllele = mergedAlleles.get(all12); - - if (mergedAllele == null) { - byte[] bases1 = all1.getBases(); - byte[] bases2 = all2.getBases(); - - byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; - System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); - if (intermediateBases != null) - System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); - System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); - - mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); - mergedAlleles.put(all12, mergedAllele); - } - - return mergedAllele; - } - - public Set getAllMergedAlleles() { - return new HashSet(mergedAlleles.values()); - } - } - - private static String mergeVariantContextNames(String name1, String name2) { - return name1 + "_" + name2; - } - - private static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { - Map mergedAttribs = new HashMap(); - - List vcList = new LinkedList(); - vcList.add(vc1); - vcList.add(vc2); - - String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY}; - for (String orAttrib : MERGE_OR_ATTRIBS) { - boolean attribVal = false; - for (VariantContext vc : vcList) { - attribVal = vc.getAttributeAsBoolean(orAttrib, false); - if (attribVal) // already true, so no reason to continue: - break; - } - mergedAttribs.put(orAttrib, attribVal); - } - - // Merge ID fields: - String iDVal = null; - for (VariantContext vc : vcList) { - String val = vc.getAttributeAsString(VariantContext.ID_KEY, null); - if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { - if (iDVal == null) - iDVal = val; - else - iDVal += VCFConstants.ID_FIELD_SEPARATOR + val; - } - } - if (iDVal != null) - mergedAttribs.put(VariantContext.ID_KEY, iDVal); - - return mergedAttribs; - } - - private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { - GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); - GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); - - if (!loc1.onSameContig(loc2)) - throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); - - if (!loc1.isBefore(loc2)) - throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2"); - - if (vc1.isFiltered() || vc2.isFiltered()) - return false; - - if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets - return false; - - if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) - return false; - - return true; - } - - private static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) { - for (Map.Entry gtEntry : vc.getGenotypes().entrySet()) { - Genotype gt = gtEntry.getValue(); - if (gt.isNoCall() || gt.isFiltered()) - return false; - } - - return true; - } - - // Assumes that vc1 and vc2 were already checked to have the same sample names: - - private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { - // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom - return false; - } - - return true; - } - - public static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { - if (gt1.getPloidy() != gt2.getPloidy()) - return false; - - /* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard]. - - HOWEVER, EVEN if this is not the case, but gt1.isHom(), - it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1. - */ - return (gt2.isPhased() || gt2.isHom() || gt1.isHom()); - } - - private static class PhaseAndQuality { - public boolean isPhased; - public Double PQ = null; - - public PhaseAndQuality(Genotype gt) { - this.isPhased = gt.isPhased(); - if (this.isPhased) { - this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); - if ( this.PQ == -1 ) this.PQ = null; - } - } - } - - // Assumes that alleleSegregationIsKnown(gt1, gt2): - - private static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { - if (gt2.isPhased() || gt2.isHom()) - return new PhaseAndQuality(gt1); // maintain the phase of gt1 - - if (!gt1.isHom()) - throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()"); - - /* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype - - For example, if we're merging the third Genotype with the second one: - 0/1 - 1|1 - 0/1 - - Then, we want to output: - 0/1 - 1/2 - */ - return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()] - } - - /* Checks if any sample has a MNP of ALT alleles (segregating together): - [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] - */ - - public static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() - - if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate - return true; - } - } - - return false; - } - - /* Checks if all samples are consistent in their haplotypes: - [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] - */ - - public static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { - // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): - Map allele1ToAllele2 = new HashMap(); - Map allele2ToAllele1 = new HashMap(); - - // Note the segregation of the alleles for the reference genome: - allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); - allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); - - // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); - - Allele all1To2 = allele1ToAllele2.get(all1); - if (all1To2 == null) - allele1ToAllele2.put(all1, all2); - else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 - return false; - - Allele all2To1 = allele2ToAllele1.get(all2); - if (all2To1 == null) - allele2ToAllele1.put(all2, all1); - else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 - return false; - } - } - - return true; - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantJEXLContext.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantJEXLContext.java deleted file mode 100644 index f16861f30..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantJEXLContext.java +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.variantcontext.v13; - -import org.apache.commons.jexl2.JexlContext; -import org.apache.commons.jexl2.MapContext; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -/** - * - * @author aaron - * @author depristo - * - * Class VariantJEXLContext - * - * implements the JEXML context for VariantContext; this saves us from - * having to generate a JEXML context lookup map everytime we want to evaluate an expression. - * - * This is package protected, only classes in variantcontext should have access to it. - * - * // todo -- clean up to remove or better support genotype filtering - */ - -class VariantJEXLContext implements JexlContext { - // our stored variant context - private VariantContext vc; - - private interface AttributeGetter { - public Object get(VariantContext vc); - } - - private static Map x = new HashMap(); - - static { - x.put("vc", new AttributeGetter() { public Object get(VariantContext vc) { return vc; }}); - x.put("CHROM", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getChr(); }}); - x.put("POS", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getStart(); }}); - x.put("TYPE", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getType().toString(); }}); - x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return 10 * vc.getNegLog10PError(); }}); - x.put("ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getAlleles(); }}); - x.put("N_ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getNAlleles(); }}); - x.put("FILTER", new AttributeGetter() { public Object get(VariantContext vc) { return vc.isFiltered() ? "1" : "0"; }}); - -// x.put("GT", new AttributeGetter() { public Object get(VariantContext vc) { return g.getGenotypeString(); }}); - x.put("homRefCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomRefCount(); }}); - x.put("hetCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHetCount(); }}); - x.put("homVarCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomVarCount(); }}); - } - - public VariantJEXLContext(VariantContext vc) { - this.vc = vc; - } - - public Object get(String name) { - Object result = null; - if ( x.containsKey(name) ) { // dynamic resolution of name -> value via map - result = x.get(name).get(vc); - } else if ( vc.hasAttribute(name)) { - result = vc.getAttribute(name); - } else if ( vc.getFilters().contains(name) ) { - result = "1"; - } - - //System.out.printf("dynamic lookup %s => %s%n", name, result); - - return result; - } - - public boolean has(String name) { - return get(name) != null; - } - - public void set(String name, Object value) { - throw new UnsupportedOperationException("remove() not supported on a VariantJEXLContext"); - } -} - - - - -/** - * this is an implementation of a Map of JexlVCMatchExp to true or false values. It lazy initializes each value - * as requested to save as much processing time as possible. - * - * Compatible with JEXL 1.1 (this code will be easier if we move to 2.0, all of the functionality can go into the - * JexlContext's get() - * - */ - -class JEXLMap implements Map { - // our variant context and/or Genotype - private final VariantContext vc; - private final Genotype g; - - // our context - private JexlContext jContext = null; - - // our mapping from JEXLVCMatchExp to Booleans, which will be set to NULL for previously uncached JexlVCMatchExp - private Map jexl; - - - public JEXLMap(Collection jexlCollection, VariantContext vc, Genotype g) { - this.vc = vc; - this.g = g; - initialize(jexlCollection); - } - - public JEXLMap(Collection jexlCollection, VariantContext vc) { - this(jexlCollection, vc, null); - } - - private void initialize(Collection jexlCollection) { - jexl = new HashMap(); - for (VariantContextUtils.JexlVCMatchExp exp: jexlCollection) { - jexl.put(exp, null); - } - } - - /** - * create the internal JexlContext, only when required. This code is where new JEXL context variables - * should get added. - * - */ - private void createContext() { - if ( g == null ) { - // todo -- remove dependancy on g to the entire system - jContext = new VariantJEXLContext(vc); - } else { - // - // this whole branch is here just to support G jexl operations - // - Map infoMap = new HashMap(); - - if ( vc != null ) { - // create a mapping of what we know about the variant context, its Chromosome, positions, etc. - infoMap.put("CHROM", vc.getChr()); - infoMap.put("POS", vc.getStart()); - infoMap.put("TYPE", vc.getType().toString()); - infoMap.put("QUAL", String.valueOf(vc.getPhredScaledQual())); - - // add alleles - infoMap.put("ALLELES", Utils.join(";", vc.getAlleles())); - infoMap.put("N_ALLELES", String.valueOf(vc.getNAlleles())); - - // add attributes - addAttributesToMap(infoMap, vc.getAttributes()); - - // add filter fields - infoMap.put("FILTER", vc.isFiltered() ? "1" : "0"); - for ( Object filterCode : vc.getFilters() ) { - infoMap.put(String.valueOf(filterCode), "1"); - } - - // add genotype-specific fields - // TODO -- implement me when we figure out a good way to represent this - // for ( Genotype g : vc.getGenotypes().values() ) { - // String prefix = g.getSampleName() + "."; - // addAttributesToMap(infoMap, g.getAttributes(), prefix); - // infoMap.put(prefix + "GT", g.getGenotypeString()); - // } - - // add specific genotype if one is provided - infoMap.put(VCFConstants.GENOTYPE_KEY, g.getGenotypeString()); - infoMap.put("isHomRef", g.isHomRef() ? "1" : "0"); - infoMap.put("isHet", g.isHet() ? "1" : "0"); - infoMap.put("isHomVar", g.isHomVar() ? "1" : "0"); - infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, new Double(g.getPhredScaledQual())); - for ( Map.Entry e : g.getAttributes().entrySet() ) { - if ( e.getValue() != null && !e.getValue().equals(VCFConstants.MISSING_VALUE_v4) ) - infoMap.put(e.getKey(), e.getValue()); - } - } - - // create the internal context that we can evaluate expressions against - - jContext = new MapContext(infoMap); - } - } - - /** - * @return the size of the internal data structure - */ - public int size() { - return jexl.size(); - } - - /** - * @return true if we're empty - */ - public boolean isEmpty() { return this.jexl.isEmpty(); } - - /** - * do we contain the specified key - * @param o the key - * @return true if we have a value for that key - */ - public boolean containsKey(Object o) { return jexl.containsKey(o); } - - public Boolean get(Object o) { - // if we've already determined the value, return it - if (jexl.containsKey(o) && jexl.get(o) != null) return jexl.get(o); - - // try and cast the expression - VariantContextUtils.JexlVCMatchExp e = (VariantContextUtils.JexlVCMatchExp) o; - evaluateExpression(e); - return jexl.get(e); - } - - /** - * get the keyset of map - * @return a set of keys of type JexlVCMatchExp - */ - public Set keySet() { - return jexl.keySet(); - } - - /** - * get all the values of the map. This is an expensive call, since it evaluates all keys that haven't - * been evaluated yet. This is fine if you truely want all the keys, but if you only want a portion, or know - * the keys you want, you would be better off using get() to get them by name. - * @return a collection of boolean values, representing the results of all the variants evaluated - */ - public Collection values() { - // this is an expensive call - for (VariantContextUtils.JexlVCMatchExp exp : jexl.keySet()) - if (jexl.get(exp) == null) - evaluateExpression(exp); - return jexl.values(); - } - - /** - * evaulate a JexlVCMatchExp's expression, given the current context (and setup the context if it's null) - * @param exp the JexlVCMatchExp to evaluate - */ - private void evaluateExpression(VariantContextUtils.JexlVCMatchExp exp) { - // if the context is null, we need to create it to evaluate the JEXL expression - if (this.jContext == null) createContext(); - try { - jexl.put (exp, (Boolean) exp.exp.evaluate(jContext)); - } catch (Exception e) { - throw new UserException.CommandLineException(String.format("Invalid JEXL expression detected for %s with message %s", exp.name, e.getMessage())); - } - } - - /** - * helper function: adds the list of attributes to the information map we're building - * @param infoMap the map - * @param attributes the attributes - */ - private static void addAttributesToMap(Map infoMap, Map attributes ) { - for (Map.Entry e : attributes.entrySet()) { - infoMap.put(e.getKey(), String.valueOf(e.getValue())); - } - } - - public Boolean put(VariantContextUtils.JexlVCMatchExp jexlVCMatchExp, Boolean aBoolean) { - return jexl.put(jexlVCMatchExp,aBoolean); - } - - public void putAll(Map map) { - jexl.putAll(map); - } - - // ////////////////////////////////////////////////////////////////////////////////////// - // The Following are unsupported at the moment - // ////////////////////////////////////////////////////////////////////////////////////// - - // this doesn't make much sense to implement, boolean doesn't offer too much variety to deal - // with evaluating every key in the internal map. - public boolean containsValue(Object o) { - throw new UnsupportedOperationException("containsValue() not supported on a JEXLMap"); - } - - // this doesn't make much sense - public Boolean remove(Object o) { - throw new UnsupportedOperationException("remove() not supported on a JEXLMap"); - } - - - public Set> entrySet() { - throw new UnsupportedOperationException("clear() not supported on a JEXLMap"); - } - - // nope - public void clear() { - throw new UnsupportedOperationException("clear() not supported on a JEXLMap"); - } -}