From 101ffc4dfd5ce83f7d6bf0b5de4a99ebfac447de Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 Nov 2011 13:35:16 -0500 Subject: [PATCH] Expanded, contrastive VariantContextBenchmark -- Compares performance across a bunch of common operations with GATK 1.3 version of VariantContext and GATK 1.4 -- 1.3 VC and associated utilities copied wholesale into test directory under v13 --- .../VariantContextBenchmark.java | 303 +++- .../variantcontext/v13/AbstractVCFCodec.java | 635 +++++++ .../utils/variantcontext/v13/Allele.java | 456 +++++ .../utils/variantcontext/v13/Genotype.java | 349 ++++ .../v13/GenotypeLikelihoods.java | 196 ++ .../variantcontext/v13/IndexingVCFWriter.java | 143 ++ .../v13/InferredGeneticContext.java | 243 +++ .../variantcontext/v13/MutableGenotype.java | 68 + .../v13/MutableVariantContext.java | 213 +++ .../utils/variantcontext/v13/VCF3Codec.java | 198 ++ .../variantcontext/v13/VCFAltHeaderLine.java | 28 + .../utils/variantcontext/v13/VCFCodec.java | 228 +++ .../v13/VCFCompoundHeaderLine.java | 224 +++ .../variantcontext/v13/VCFConstants.java | 112 ++ .../v13/VCFFilterHeaderLine.java | 28 + .../v13/VCFFormatHeaderLine.java | 32 + .../utils/variantcontext/v13/VCFHeader.java | 198 ++ .../variantcontext/v13/VCFHeaderLine.java | 134 ++ .../v13/VCFHeaderLineCount.java | 8 + .../v13/VCFHeaderLineTranslator.java | 124 ++ .../variantcontext/v13/VCFHeaderLineType.java | 8 + .../variantcontext/v13/VCFHeaderVersion.java | 91 + .../variantcontext/v13/VCFInfoHeaderLine.java | 29 + .../v13/VCFNamedHeaderLine.java | 30 + .../utils/variantcontext/v13/VCFParser.java | 22 + .../v13/VCFSimpleHeaderLine.java | 81 + .../utils/variantcontext/v13/VCFUtils.java | 227 +++ .../utils/variantcontext/v13/VCFWriter.java | 16 + .../variantcontext/v13/VariantContext.java | 1615 +++++++++++++++++ .../v13/VariantContextUtils.java | 1407 ++++++++++++++ .../v13/VariantJEXLContext.java | 315 ++++ 31 files changed, 7727 insertions(+), 34 deletions(-) create mode 100755 
public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/AbstractVCFCodec.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Allele.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Genotype.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/GenotypeLikelihoods.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/IndexingVCFWriter.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/InferredGeneticContext.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableGenotype.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableVariantContext.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCF3Codec.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFAltHeaderLine.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCodec.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCompoundHeaderLine.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFConstants.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFilterHeaderLine.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFormatHeaderLine.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeader.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLine.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineCount.java create mode 100755 
public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineTranslator.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLineType.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderVersion.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFInfoHeaderLine.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFNamedHeaderLine.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFParser.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFSimpleHeaderLine.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFUtils.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFWriter.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContext.java create mode 100755 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContextUtils.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantJEXLContext.java diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java index 06ce61627..e16176dff 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java @@ -27,8 +27,14 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.caliper.Param; import com.google.caliper.SimpleBenchmark; import com.google.caliper.runner.CaliperMain; +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broad.tribble.Feature; +import 
org.broad.tribble.FeatureCodec; import org.broad.tribble.readers.AsciiLineReader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.*; import java.util.*; @@ -46,24 +52,39 @@ public class VariantContextBenchmark extends SimpleBenchmark { @Param({"100"}) int nSamplesToTake; // set automatically by framework - @Param({"READ", "READ_SUBSET"}) - Operation operation; // set automatically by framework + @Param({"10"}) + int dupsToMerge; // set automatically by framework - @Param({"OF_SAMPLES"}) - SubContextOp subContextOp; // set automatically by framework + @Param + Operation operation; // set automatically by framework private String INPUT_STRING; public enum Operation { READ, - READ_SUBSET + SUBSET_TO_SAMPLES, + GET_TYPE, + GET_ID, + GET_GENOTYPES, + GET_ATTRIBUTE_STRING, + GET_ATTRIBUTE_INT, + GET_N_SAMPLES, + GET_GENOTYPES_FOR_SAMPLES, + GET_GENOTYPES_IN_ORDER_OF_NAME, + CALC_GENOTYPE_COUNTS, + MERGE } - public enum SubContextOp { - OF_SAMPLES - } + private GenomeLocParser b37GenomeLocParser; @Override protected void setUp() { + try { + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.b37KGReference)); + b37GenomeLocParser = new GenomeLocParser(seq); + } catch ( FileNotFoundException e) { + throw new RuntimeException(e); + } + // read it into a String so that we don't try to benchmark IO issues try { FileInputStream s = new FileInputStream(new File(vcfFile)); @@ -83,52 +104,266 @@ public class VariantContextBenchmark extends SimpleBenchmark { } } - private void parseGenotypes(VCFCodec codec, Operation op, SubContextOp subop ) { + private interface FunctionToBenchmark { + public void run(T vc); + } + + private void runBenchmark(FeatureCodec codec, FunctionToBenchmark func) { try { InputStream is = new 
ByteArrayInputStream(INPUT_STRING.getBytes()); AsciiLineReader lineReader = new AsciiLineReader(is); codec.readHeader(lineReader); int counter = 0; - Set samples = null; while (counter++ < linesToRead ) { String line = lineReader.readLine(); if ( line == null ) break; - VariantContext vc = (VariantContext)codec.decode(line); - if ( samples == null ) { - samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); - } - - if ( op == Operation.READ_SUBSET) - processOneVC(vc, samples, subop); + T vc = codec.decode(line); + func.run(vc); } } catch (Exception e) { System.out.println("Benchmarking run failure because of " + e.getMessage()); } } - public void timeMe(int rep) { - for ( int i = 0; i < rep; i++ ) - parseGenotypes(new VCFCodec(), operation, subContextOp); + public void timeV14(int rep) { + for ( int i = 0; i < rep; i++ ) { + FunctionToBenchmark func = getV14FunctionToBenchmark(); + FeatureCodec codec = new VCFCodec(); + runBenchmark(codec, func); + } + } + + public FunctionToBenchmark getV14FunctionToBenchmark() { + switch ( operation ) { + case READ: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + ; // empty operation + } + }; + case SUBSET_TO_SAMPLES: + return new FunctionToBenchmark() { + Set samples; + public void run(final VariantContext vc) { + if ( samples == null ) + samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + VariantContext sub = vc.subContextFromSamples(samples); + sub.getNSamples(); + } + }; + case GET_TYPE: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getType(); + } + }; + case GET_ID: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getID(); + } + }; + case GET_GENOTYPES: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getGenotypes().size(); + } + }; + + case GET_GENOTYPES_FOR_SAMPLES: + return new FunctionToBenchmark() { + 
Set samples; + public void run(final VariantContext vc) { + if ( samples == null ) + samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + vc.getGenotypes(samples).size(); + } + }; + + case GET_ATTRIBUTE_STRING: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getAttribute("AN", null); + } + }; + + case GET_ATTRIBUTE_INT: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getAttributeAsInt("AC", 0); + } + }; + + case GET_N_SAMPLES: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getNSamples(); + } + }; + + case GET_GENOTYPES_IN_ORDER_OF_NAME: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + ; // TODO - TEST IS BROKEN +// int n = 0; +// for ( final Genotype g: vc.getGenotypesSortedByName() ) n++; + } + }; + + case CALC_GENOTYPE_COUNTS: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getHetCount(); + } + }; + + case MERGE: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + List toMerge = new ArrayList(); + + for ( int i = 0; i < dupsToMerge; i++ ) { + GenotypeCollection gc = GenotypeCollection.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + gc.add(new Genotype(g.getSampleName()+"_"+i, g)); + } + toMerge.add(VariantContext.modifyGenotypes(vc, gc)); + } + + VariantContextUtils.simpleMerge(b37GenomeLocParser, toMerge, null, + VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + VariantContextUtils.GenotypeMergeType.UNSORTED, + true, false, "set", false, true); + } + }; + + default: throw new IllegalArgumentException("Unexpected operation " + operation); + } + } + + public void timeV13(int rep) { + for ( int i = 0; i < rep; i++ ) { + FunctionToBenchmark func = getV13FunctionToBenchmark(); + FeatureCodec codec = new org.broadinstitute.sting.utils.variantcontext.v13.VCFCodec(); 
+ runBenchmark(codec, func); + } + } + + public FunctionToBenchmark getV13FunctionToBenchmark() { + switch ( operation ) { + case READ: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + ; // empty operation + } + }; + case SUBSET_TO_SAMPLES: + return new FunctionToBenchmark() { + List samples; + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + if ( samples == null ) + samples = new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake); + org.broadinstitute.sting.utils.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values()); + sub.getNSamples(); + } + }; + + case GET_TYPE: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + vc.getType(); + } + }; + case GET_ID: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + vc.getID(); + } + }; + case GET_GENOTYPES: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + vc.getGenotypes().size(); + } + }; + + case GET_GENOTYPES_FOR_SAMPLES: + return new FunctionToBenchmark() { + Set samples; + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + if ( samples == null ) + samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + vc.getGenotypes(samples).size(); + } + }; + + case GET_ATTRIBUTE_STRING: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + vc.getAttribute("AN", null); + } + }; + + case GET_ATTRIBUTE_INT: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + 
vc.getAttributeAsInt("AC", 0); + } + }; + + case GET_N_SAMPLES: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + vc.getNSamples(); + } + }; + + case GET_GENOTYPES_IN_ORDER_OF_NAME: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + ; // TODO - TEST IS BROKEN + //vc.getGenotypesSortedByName(); + } + }; + + case CALC_GENOTYPE_COUNTS: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + vc.getHetCount(); + } + }; + + case MERGE: + return new FunctionToBenchmark() { + public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { + List toMerge = new ArrayList(); + + for ( int i = 0; i < dupsToMerge; i++ ) { + Map gc = new HashMap(); + for ( final org.broadinstitute.sting.utils.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) { + String name = g.getSampleName()+"_"+i; + gc.put(name, new org.broadinstitute.sting.utils.variantcontext.v13.Genotype(name, + g.getAlleles(), g.getNegLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector())); + toMerge.add(org.broadinstitute.sting.utils.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc)); + } + } + + org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser, + toMerge, null, + org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, + true, false, "set", false, true); + } + }; + + default: throw new IllegalArgumentException("Unexpected operation " + operation); + } } public static void main(String[] args) { CaliperMain.main(VariantContextBenchmark.class, args); } - - private static final void 
processOneVC(VariantContext vc, Set samples, SubContextOp subop) { - VariantContext sub; - - switch ( subop ) { - case OF_SAMPLES: - sub = vc.subContextFromSamples(samples, vc.getAlleles()); - break; - default: - throw new RuntimeException("Unexpected op: " + subop); - } - - sub.getNSamples(); - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/AbstractVCFCodec.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/AbstractVCFCodec.java new file mode 100755 index 000000000..5310313e0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/AbstractVCFCodec.java @@ -0,0 +1,635 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.NameAwareCodec; +import org.broad.tribble.TribbleException; +import org.broad.tribble.readers.LineReader; +import org.broad.tribble.util.BlockCompressedInputStream; +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.*; +import java.util.*; +import java.util.zip.GZIPInputStream; + + +abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser { + + protected final static Logger log = Logger.getLogger(VCFCodec.class); + protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column + + protected VCFHeaderVersion version; + + // we have to store the list of strings that make up the header until they're needed + protected VCFHeader header = null; + + // a mapping of the allele + protected Map> alleleMap = new HashMap>(3); + + // for ParsingUtils.split + protected String[] GTValueArray = new String[100]; + protected String[] genotypeKeyArray = new String[100]; + protected String[] infoFieldArray = new String[1000]; + protected String[] 
infoValueArray = new String[1000]; + + // for performance testing purposes + public static boolean validate = true; + + // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over + // todo: make this thread safe? + protected String[] parts = null; + protected String[] genotypeParts = null; + + // for performance we cache the hashmap of filter encodings for quick lookup + protected HashMap> filterHash = new HashMap>(); + + // a mapping of the VCF fields to their type, filter fields, and format fields, for quick lookup to validate against + TreeMap infoFields = new TreeMap(); + TreeMap formatFields = new TreeMap(); + Set filterFields = new HashSet(); + + // we store a name to give to each of the variant contexts we emit + protected String name = "Unknown"; + + protected int lineNo = 0; + + protected Map stringCache = new HashMap(); + + + /** + * @param reader the line reader to take header lines from + * @return the number of header lines + */ + public abstract Object readHeader(LineReader reader); + + /** + * create a genotype map + * @param str the string + * @param alleles the list of alleles + * @param chr chrom + * @param pos position + * @return a mapping of sample name to genotype object + */ + public abstract Map createGenotypeMap(String str, List alleles, String chr, int pos); + + + /** + * parse the filter string, first checking to see if we already have parsed it in a previous attempt + * @param filterString the string to parse + * @return a set of the filters applied + */ + protected abstract Set parseFilters(String filterString); + + /** + * create a VCF header + * @param headerStrings a list of strings that represent all the ## entries + * @param line the single # line (column names) + * @return the count of header lines + */ + protected Object createHeader(List headerStrings, String line) { + + headerStrings.add(line); + + Set metaData = new TreeSet(); + Set auxTags = new LinkedHashSet(); + // 
iterate over all the passed in strings + for ( String str : headerStrings ) { + if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { + String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); + if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) + throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); + + int arrayIndex = 0; + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + try { + if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) + throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'"); + } catch (IllegalArgumentException e) { + throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name."); + } + arrayIndex++; + } + + boolean sawFormatTag = false; + if ( arrayIndex < strings.length ) { + if ( !strings[arrayIndex].equals("FORMAT") ) + throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); + sawFormatTag = true; + arrayIndex++; + } + + while ( arrayIndex < strings.length ) + auxTags.add(strings[arrayIndex++]); + + if ( sawFormatTag && auxTags.size() == 0 ) + throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data"); + + } else { + if ( str.startsWith("##INFO=") ) { + VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version); + metaData.add(info); + infoFields.put(info.getName(), info.getType()); + } else if ( str.startsWith("##FILTER=") ) { + VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9),version); + metaData.add(filter); + filterFields.add(filter.getName()); + } else if ( str.startsWith("##FORMAT=") ) { + VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9),version); + metaData.add(format); + 
formatFields.put(format.getName(), format.getType()); + } else { + int equals = str.indexOf("="); + if ( equals != -1 ) + metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); + } + } + } + + header = new VCFHeader(metaData, auxTags); + return header; + } + + /** + * the fast decode function + * @param line the line of text for the record + * @return a feature, (not guaranteed complete) that has the correct start and stop + */ + public Feature decodeLoc(String line) { + lineNo++; + + // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line + if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; + + // our header cannot be null, we need the genotype sample names and counts + if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); + + final String[] locParts = new String[6]; + int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + + if ( nParts != 6 ) + throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); + + // get our alleles (because the end position depends on them) + final String ref = getCachedString(locParts[3].toUpperCase()); + final String alts = getCachedString(locParts[4].toUpperCase()); + final List alleles = parseAlleles(ref, alts, lineNo); + + // find out our location + final int start = Integer.valueOf(locParts[1]); + int stop = start; + + // ref alleles don't need to be single bases for monomorphic sites + if ( alleles.size() == 1 ) { + stop = start + alleles.get(0).length() - 1; + } else if ( !isSingleNucleotideEvent(alleles) ) { + stop = clipAlleles(start, ref, alleles, null, lineNo); + } + + return new VCFLocFeature(locParts[0], start, stop); + } + + private final static class VCFLocFeature implements Feature { + + final String chr; + final int start, stop; + + private VCFLocFeature(String chr, int start, int stop) { + 
this.chr = chr; + this.start = start; + this.stop = stop; + } + + public String getChr() { return chr; } + public int getStart() { return start; } + public int getEnd() { return stop; } + } + + + /** + * decode the line into a feature (VariantContext) + * @param line the line + * @return a VariantContext + */ + public Feature decode(String line) { + // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line + if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; + + // our header cannot be null, we need the genotype sample names and counts + if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); + + if (parts == null) + parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)]; + + int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + + // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data) + if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) || + (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) + throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? 
NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + + " tokens, and saw " + nParts + " )", lineNo); + + return parseVCFLine(parts); + } + + protected void generateException(String message) { + throw new UserException.MalformedVCF(message, lineNo); + } + + protected static void generateException(String message, int lineNo) { + throw new UserException.MalformedVCF(message, lineNo); + } + + /** + * parse out the VCF line + * + * @param parts the parts split up + * @return a variant context object + */ + private VariantContext parseVCFLine(String[] parts) { + // increment the line count + lineNo++; + + // parse out the required fields + String contig = getCachedString(parts[0]); + int pos = Integer.valueOf(parts[1]); + String id = null; + if ( parts[2].length() == 0 ) + generateException("The VCF specification requires a valid ID field"); + else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) + id = VCFConstants.EMPTY_ID_FIELD; + else + id = new String(parts[2]); + String ref = getCachedString(parts[3].toUpperCase()); + String alts = getCachedString(parts[4].toUpperCase()); + Double qual = parseQual(parts[5]); + String filter = getCachedString(parts[6]); + String info = new String(parts[7]); + + // get our alleles, filters, and setup an attribute map + List alleles = parseAlleles(ref, alts, lineNo); + Set filters = parseFilters(filter); + Map attributes = parseInfo(info, id); + + // find out our current location, and clip the alleles down to their minimum length + int loc = pos; + // ref alleles don't need to be single bases for monomorphic sites + if ( alleles.size() == 1 ) { + loc = pos + alleles.get(0).length() - 1; + } else if ( !isSingleNucleotideEvent(alleles) ) { + ArrayList newAlleles = new ArrayList(); + loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); + alleles = newAlleles; + } + + // do we have genotyping data + if (parts.length > NUM_STANDARD_FIELDS) { + attributes.put(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, new String(parts[8])); + 
attributes.put(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY, this); + } + + VariantContext vc = null; + try { + vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]); + } catch (Exception e) { + generateException(e.getMessage()); + } + + // did we resort the sample names? If so, we need to load the genotype data + if ( !header.samplesWereAlreadySorted() ) + vc.getGenotypes(); + + return vc; + } + + /** + * + * @return the type of record + */ + public Class getFeatureType() { + return VariantContext.class; + } + + /** + * get the name of this codec + * @return our set name + */ + public String getName() { + return name; + } + + /** + * set the name of this codec + * @param name new name + */ + public void setName(String name) { + this.name = name; + } + + /** + * Return a cached copy of the supplied string. + * + * @param str string + * @return interned string + */ + protected String getCachedString(String str) { + String internedString = stringCache.get(str); + if ( internedString == null ) { + internedString = new String(str); + stringCache.put(internedString, internedString); + } + return internedString; + } + + /** + * parse out the info fields + * @param infoField the fields + * @param id the indentifier + * @return a mapping of keys to objects + */ + private Map parseInfo(String infoField, String id) { + Map attributes = new HashMap(); + + if ( infoField.length() == 0 ) + generateException("The VCF specification requires a valid info field"); + + if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) { + if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 ) + generateException("The VCF specification does not allow for whitespace in the INFO field"); + + int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false); + for (int i = 0; i < infoFieldSplitSize; i++) { + String key; + Object value; + + int eqI = infoFieldArray[i].indexOf("="); + if 
( eqI != -1 ) { + key = infoFieldArray[i].substring(0, eqI); + String str = infoFieldArray[i].substring(eqI+1); + + // split on the INFO field separator + int infoValueSplitSize = ParsingUtils.split(str, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); + if ( infoValueSplitSize == 1 ) { + value = infoValueArray[0]; + } else { + ArrayList valueList = new ArrayList(infoValueSplitSize); + for ( int j = 0; j < infoValueSplitSize; j++ ) + valueList.add(infoValueArray[j]); + value = valueList; + } + } else { + key = infoFieldArray[i]; + value = true; + } + + attributes.put(key, value); + } + } + + if ( ! id.equals(VCFConstants.EMPTY_ID_FIELD) ) + attributes.put(VariantContext.ID_KEY, id); + return attributes; + } + + /** + * create a an allele from an index and an array of alleles + * @param index the index + * @param alleles the alleles + * @return an Allele + */ + protected static Allele oneAllele(String index, List alleles) { + if ( index.equals(VCFConstants.EMPTY_ALLELE) ) + return Allele.NO_CALL; + int i = Integer.valueOf(index); + if ( i >= alleles.size() ) + throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record"); + return alleles.get(i); + } + + + /** + * parse genotype alleles from the genotype string + * @param GT GT string + * @param alleles list of possible alleles + * @param cache cache of alleles for GT + * @return the allele list for the GT string + */ + protected static List parseGenotypeAlleles(String GT, List alleles, Map> cache) { + // cache results [since they are immutable] and return a single object for each genotype + List GTAlleles = cache.get(GT); + + if ( GTAlleles == null ) { + StringTokenizer st = new StringTokenizer(GT, VCFConstants.PHASING_TOKENS); + GTAlleles = new ArrayList(st.countTokens()); + while ( st.hasMoreTokens() ) { + String genotype = st.nextToken(); + GTAlleles.add(oneAllele(genotype, alleles)); + } + cache.put(GT, 
GTAlleles); + } + + return GTAlleles; + } + + /** + * parse out the qual value + * @param qualString the quality string + * @return return a double + */ + protected static Double parseQual(String qualString) { + // if we're the VCF 4 missing char, return immediately + if ( qualString.equals(VCFConstants.MISSING_VALUE_v4)) + return VariantContext.NO_NEG_LOG_10PERROR; + + Double val = Double.valueOf(qualString); + + // check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles + if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON)) + return VariantContext.NO_NEG_LOG_10PERROR; + + // scale and return the value + return val / 10.0; + } + + /** + * parse out the alleles + * @param ref the reference base + * @param alts a string of alternates to break into alleles + * @param lineNo the line number for this record + * @return a list of alleles, and a pair of the shortest and longest sequence + */ + protected static List parseAlleles(String ref, String alts, int lineNo) { + List alleles = new ArrayList(2); // we are almost always biallelic + // ref + checkAllele(ref, true, lineNo); + Allele refAllele = Allele.create(ref, true); + alleles.add(refAllele); + + if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split + parseSingleAltAllele(alleles, alts, lineNo); + else + for ( String alt : alts.split(",") ) + parseSingleAltAllele(alleles, alt, lineNo); + + return alleles; + } + + /** + * check to make sure the allele is an acceptable allele + * @param allele the allele to check + * @param isRef are we the reference allele? 
+ * @param lineNo the line number for this record + */ + private static void checkAllele(String allele, boolean isRef, int lineNo) { + if ( allele == null || allele.length() == 0 ) + generateException("Empty alleles are not permitted in VCF records", lineNo); + + if ( isSymbolicAllele(allele) ) { + if ( isRef ) { + generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); + } + } else { + // check for VCF3 insertions or deletions + if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) ) + generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" + + " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo); + + if (!Allele.acceptableAlleleBases(allele)) + generateException("Unparsable vcf record with allele " + allele, lineNo); + + if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) ) + generateException("The reference allele cannot be missing", lineNo); + } + } + + /** + * return true if this is a symbolic allele (e.g. ) otherwise false + * @param allele the allele to check + * @return true if the allele is a symbolic allele, otherwise false + */ + private static boolean isSymbolicAllele(String allele) { + return (allele != null && allele.startsWith("<") && allele.endsWith(">") && allele.length() > 2); + } + + /** + * parse a single allele, given the allele list + * @param alleles the alleles available + * @param alt the allele to parse + * @param lineNo the line number for this record + */ + private static void parseSingleAltAllele(List alleles, String alt, int lineNo) { + checkAllele(alt, false, lineNo); + + Allele allele = Allele.create(alt, false); + if ( ! 
allele.isNoCall() ) + alleles.add(allele); + } + + protected static boolean isSingleNucleotideEvent(List alleles) { + for ( Allele a : alleles ) { + if ( a.length() != 1 ) + return false; + } + return true; + } + + public static int computeForwardClipping(List unclippedAlleles, String ref) { + boolean clipping = true; + + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) { + clipping = false; + break; + } + } + + return (clipping) ? 1 : 0; + } + + protected static int computeReverseClipping(List unclippedAlleles, String ref, int forwardClipping, int lineNo) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) + stillClipping = false; + else if ( ref.length() == clipping ) + generateException("bad alleles encountered", lineNo); + else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] ) + stillClipping = false; + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } + /** + * clip the alleles, based on the reference + * + * @param position the unadjusted start position (pre-clipping) + * @param ref the reference string + * @param unclippedAlleles the list of unclipped alleles + * @param clippedAlleles output list of clipped alleles + * @param lineNo the current line number in the file + * @return the new reference end position of this event + */ + protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { + + int forwardClipping = computeForwardClipping(unclippedAlleles, ref); + int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); + + if ( clippedAlleles != null ) { + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + 
clippedAlleles.add(a); + } else { + clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference())); + } + } + } + + // the new reference length + int refLength = ref.length() - reverseClipping; + + return position+Math.max(refLength - 1,0); + } + + public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) { + try { + return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || + isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) || + isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE); + } catch ( FileNotFoundException e ) { + return false; + } catch ( IOException e ) { + return false; + } + } + + private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) { + try { + byte[] buff = new byte[MAGIC_HEADER_LINE.length()]; + int nread = stream.read(buff, 0, MAGIC_HEADER_LINE.length()); + boolean eq = Arrays.equals(buff, MAGIC_HEADER_LINE.getBytes()); + return eq; +// String firstLine = new String(buff); +// return firstLine.startsWith(MAGIC_HEADER_LINE); + } catch ( IOException e ) { + return false; + } catch ( RuntimeException e ) { + return false; + } finally { + try { stream.close(); } catch ( IOException e ) {} + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Allele.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Allele.java new file mode 100755 index 000000000..f43cd7b98 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Allele.java @@ -0,0 +1,456 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +/** + * Immutable representation of an allele + * + * Types of alleles: + * + * Ref: a 
t C g a // C is the reference base + * + * : a t G g a // C base is a G in some individuals + * + * : a t - g a // C base is deleted w.r.t. the reference + * + * : a t CAg a // A base is inserted w.r.t. the reference sequence + * + * In these cases, where are the alleles? + * + * SNP polymorphism of C/G -> { C , G } -> C is the reference allele + * 1 base deletion of C -> { C , - } -> C is the reference allele + * 1 base insertion of A -> { - ; A } -> Null is the reference allele + * + * Suppose I see a the following in the population: + * + * Ref: a t C g a // C is the reference base + * : a t G g a // C base is a G in some individuals + * : a t - g a // C base is deleted w.r.t. the reference + * + * How do I represent this? There are three segregating alleles: + * + * { C , G , - } + * + * Now suppose I have this more complex example: + * + * Ref: a t C g a // C is the reference base + * : a t - g a + * : a t - - a + * : a t CAg a + * + * There are actually four segregating alleles: + * + * { C g , - g, - -, and CAg } over bases 2-4 + * + * However, the molecular equivalence explicitly listed above is usually discarded, so the actual + * segregating alleles are: + * + * { C g, g, -, C a g } + * + * Critically, it should be possible to apply an allele to a reference sequence to create the + * correct haplotype sequence: + * + * Allele + reference => haplotype + * + * For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the + * Allele object itself. So there's an idea of an A/C polymorphism independent of it's surrounding context. + * + * Given list of alleles it's possible to determine the "type" of the variation + * + * A / C @ loc => SNP with + * - / A => INDEL + * + * If you know where allele is the reference, you can determine whether the variant is an insertion or deletion. + * + * Alelle also supports is concept of a NO_CALL allele. This Allele represents a haplotype that couldn't be + * determined. 
/**
 * Immutable representation of an allele.
 *
 * An Allele is a DNA sequence (stored as bytes) optionally tagged as the
 * reference allele.  Special alleles: the null allele ('-', zero bases), the
 * NO_CALL allele ('.', an undetermined haplotype), and symbolic alleles
 * ("&lt;ID&gt;", no concrete base sequence).  Single-base alleles are interned
 * as shared static constants.
 *
 * Note that Alleles store all bases as bytes; bases are compared as given
 * (upper-casing of multi-base alleles is intentionally skipped for speed).
 *
 * @author ebanks, depristo
 */
class Allele implements Comparable<Allele> {
    private static final byte[] EMPTY_ALLELE_BASES = new byte[0];

    private boolean isRef = false;
    private boolean isNull = false;
    private boolean isNoCall = false;
    private boolean isSymbolic = false;

    private byte[] bases = null;

    public final static String NULL_ALLELE_STRING = "-";
    public final static String NO_CALL_STRING = ".";

    // no public way to create an allele
    private Allele(byte[] bases, boolean isRef) {
        // standardize our representation of null allele and bases
        if ( wouldBeNullAllele(bases) ) {
            bases = EMPTY_ALLELE_BASES;
            isNull = true;
        } else if ( wouldBeNoCallAllele(bases) ) {
            bases = EMPTY_ALLELE_BASES;
            isNoCall = true;
            if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
        } else if ( wouldBeSymbolicAllele(bases) ) {
            isSymbolic = true;
            if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele");
        }

        this.isRef = isRef;
        this.bases = bases;

        if ( ! acceptableAlleleBases(bases) )
            throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'");
    }

    private Allele(String bases, boolean isRef) {
        this(bases.getBytes(), isRef);
    }

    // interned single-base alleles; NO_CALL must be declared after them so the
    // static constants exist by the time create() is first used
    private final static Allele REF_A = new Allele("A", true);
    private final static Allele ALT_A = new Allele("A", false);
    private final static Allele REF_C = new Allele("C", true);
    private final static Allele ALT_C = new Allele("C", false);
    private final static Allele REF_G = new Allele("G", true);
    private final static Allele ALT_G = new Allele("G", false);
    private final static Allele REF_T = new Allele("T", true);
    private final static Allele ALT_T = new Allele("T", false);
    private final static Allele REF_N = new Allele("N", true);
    private final static Allele ALT_N = new Allele("N", false);
    private final static Allele REF_NULL = new Allele(NULL_ALLELE_STRING, true);
    private final static Allele ALT_NULL = new Allele(NULL_ALLELE_STRING, false);

    /** A generic static NO_CALL allele for use */
    public final static Allele NO_CALL = new Allele(NO_CALL_STRING, false);

    // ---------------------------------------------------------------------------------------------------------
    //
    // creation routines
    //
    // ---------------------------------------------------------------------------------------------------------

    /**
     * Create a new Allele that includes bases and is tagged as the reference allele if isRef == true.
     * If bases == '-', a Null allele is created. If bases == '.', a no-call Allele is created.
     *
     * @param bases the DNA sequence of this variation, '-', or '.'
     * @param isRef should we make this a reference allele?
     * @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformatted
     */
    public static Allele create(byte[] bases, boolean isRef) {
        if ( bases == null )
            throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele");

        if ( bases.length == 1 ) {
            // optimization to return a static constant Allele for each single base object
            switch (bases[0]) {
                case '.':
                    if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
                    return NO_CALL;
                case '-': return isRef ? REF_NULL : ALT_NULL;
                case 'A': case 'a' : return isRef ? REF_A : ALT_A;
                case 'C': case 'c' : return isRef ? REF_C : ALT_C;
                case 'G': case 'g' : return isRef ? REF_G : ALT_G;
                case 'T': case 't' : return isRef ? REF_T : ALT_T;
                case 'N': case 'n' : return isRef ? REF_N : ALT_N;
                default: throw new IllegalArgumentException("Illegal base: " + (char)bases[0]);
            }
        } else {
            return new Allele(bases, isRef);
        }
    }

    public static Allele create(byte base, boolean isRef) {
        return create( new byte[]{ base }, isRef);
    }

    public static Allele create(byte base) {
        return create( base, false );
    }

    /**
     * Append the bases in right to the allele left, preserving left's reference tag.
     *
     * @param left  the allele to extend (must not be symbolic)
     * @param right the bases to append
     * @return the extended allele
     */
    public static Allele extend(Allele left, byte[] right) {
        if (left.isSymbolic())
            throw new IllegalArgumentException("Cannot extend a symbolic allele");
        byte[] bases = null;
        if ( left.length() == 0 )
            bases = right;
        else {
            bases = new byte[left.length() + right.length];
            System.arraycopy(left.getBases(), 0, bases, 0, left.length());
            System.arraycopy(right, 0, bases, left.length(), right.length);
        }

        return create(bases, left.isReference());
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent the null allele
     */
    public static boolean wouldBeNullAllele(byte[] bases) {
        return (bases.length == 1 && bases[0] == '-') || bases.length == 0;
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent the NO_CALL allele
     */
    public static boolean wouldBeNoCallAllele(byte[] bases) {
        return bases.length == 1 && bases[0] == '.';
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent a symbolic allele
     */
    public static boolean wouldBeSymbolicAllele(byte[] bases) {
        return bases.length > 2 && bases[0] == '<' && bases[bases.length-1] == '>';
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent a well formatted allele
     */
    public static boolean acceptableAlleleBases(String bases) {
        return acceptableAlleleBases(bases.getBytes());
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent a well formatted allele
     */
    public static boolean acceptableAlleleBases(byte[] bases) {
        if ( wouldBeNullAllele(bases) || wouldBeNoCallAllele(bases) || wouldBeSymbolicAllele(bases) )
            return true;

        for ( int i = 0; i < bases.length; i++ ) {
            switch (bases[i]) {
                case 'A': case 'C': case 'G': case 'T': case 'N' : case 'a': case 'c': case 'g': case 't': case 'n' :
                    break;
                default:
                    return false;
            }
        }

        return true;
    }

    /**
     * @see #create(byte[], boolean)
     *
     * @param bases bases representing an allele
     * @param isRef is this the reference allele?
     */
    public static Allele create(String bases, boolean isRef) {
        return create(bases.getBytes(), isRef);
    }

    /**
     * Creates a non-Ref allele.  @see #create(byte[], boolean) for full information
     *
     * @param bases bases representing an allele
     */
    public static Allele create(String bases) {
        return create(bases, false);
    }

    /**
     * Creates a non-Ref allele.  @see #create(byte[], boolean) for full information
     *
     * @param bases bases representing an allele
     */
    public static Allele create(byte[] bases) {
        return create(bases, false);
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // accessor routines
    //
    // ---------------------------------------------------------------------------------------------------------

    // Returns true if this is the null allele
    public boolean isNull()             { return isNull; }
    // Returns true if this is not the null allele
    public boolean isNonNull()          { return ! isNull(); }

    // Returns true if this is the NO_CALL allele
    public boolean isNoCall()           { return isNoCall; }
    // Returns true if this is not the NO_CALL allele
    public boolean isCalled()           { return ! isNoCall(); }

    // Returns true if this Allele is the reference allele
    public boolean isReference()        { return isRef; }
    // Returns true if this Allele is not the reference allele
    public boolean isNonReference()     { return ! isReference(); }

    // Returns true if this Allele is symbolic (i.e. no well-defined base sequence)
    public boolean isSymbolic()         { return isSymbolic; }

    // Returns a nice string representation of this object
    public String toString() {
        return (isNull() ? NULL_ALLELE_STRING : ( isNoCall() ? NO_CALL_STRING : getDisplayString() )) + (isReference() ? "*" : "");
    }

    /**
     * Return the DNA bases segregating in this allele.  Note this isn't reference polarized,
     * so the Null allele is represented by a vector of length 0
     *
     * @return the segregating bases
     */
    public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; }

    /**
     * Return the DNA bases segregating in this allele in String format.
     * This is useful, because toString() adds a '*' to reference alleles and getBases() returns
     * garbage when you call toString() on it.
     *
     * @return the segregating bases
     */
    public String getBaseString() { return new String(getBases()); }

    /**
     * Return the printed representation of this allele.
     * Same as getBaseString(), except for symbolic alleles.
     * For symbolic alleles, the base string is empty while the display string contains the symbolic name.
     *
     * @return the allele string representation
     */
    public String getDisplayString() { return new String(bases); }

    /**
     * @param other the other allele
     *
     * @return true if these alleles are equal
     */
    public boolean equals(Object other) {
        return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) );
    }

    /**
     * @return hash code
     */
    public int hashCode() {
        int hash = 1;
        for (int i = 0; i < bases.length; i++)
            hash += (i+1) * bases[i];
        return hash;
    }

    /**
     * Returns true if this and other are equal.  If ignoreRefState is true, then doesn't require
     * both alleles to have the same ref tag.
     *
     * @param other          allele to compare to
     * @param ignoreRefState if true, ignore ref state in comparison
     * @return true if this and other are equal
     */
    public boolean equals(Allele other, boolean ignoreRefState) {
        return this == other || (isRef == other.isRef || ignoreRefState) && isNull == other.isNull && isNoCall == other.isNoCall && (bases == other.bases || Arrays.equals(bases, other.bases));
    }

    /**
     * @param test bases to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); }

    /**
     * @param test bases to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); }

    /**
     * @param test allele to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); }

    /**
     * @return the length of this allele.  Null and NO_CALL alleles have 0 length.
     */
    public int length() {
        return isSymbolic ? 0 : bases.length;
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // useful static functions
    //
    // ---------------------------------------------------------------------------------------------------------

    public static Allele getMatchingAllele(Collection<Allele> allAlleles, String alleleBases) {
        return getMatchingAllele(allAlleles, alleleBases.getBytes());
    }

    public static Allele getMatchingAllele(Collection<Allele> allAlleles, byte[] alleleBases) {
        for ( Allele a : allAlleles ) {
            if ( a.basesMatch(alleleBases) ) {
                return a;
            }
        }

        if ( wouldBeNoCallAllele(alleleBases) )
            return NO_CALL;
        else
            return null; // couldn't find anything
    }

    /**
     * Map each allele string onto a matching Allele from possibleAlleles; NO_CALL strings
     * are created fresh, anything else unmatched is an error.
     *
     * @param possibleAlleles the alleles that can be matched
     * @param alleleStrings   the allele strings to resolve
     * @return the resolved alleles, in the order of alleleStrings
     */
    public static List<Allele> resolveAlleles(List<Allele> possibleAlleles, List<String> alleleStrings) {
        List<Allele> myAlleles = new ArrayList<Allele>(alleleStrings.size());

        for ( String alleleString : alleleStrings ) {
            Allele allele = getMatchingAllele(possibleAlleles, alleleString);

            if ( allele == null ) {
                if ( Allele.wouldBeNoCallAllele(alleleString.getBytes()) ) {
                    allele = create(alleleString);
                } else {
                    throw new IllegalArgumentException("Allele " + alleleString + " not present in the list of alleles " + possibleAlleles);
                }
            }

            myAlleles.add(allele);
        }

        return myAlleles;
    }

    // reference alleles sort before alternates; otherwise lexicographic by base string
    public int compareTo(Allele other) {
        if ( isReference() && other.isNonReference() )
            return -1;
        else if ( isNonReference() && other.isReference() )
            return 1;
        else
            return getBaseString().compareTo(other.getBaseString()); // todo -- potential performance issue
    }

    /**
     * @return true if either allele is null, or if the shorter allele's bases are a prefix of the longer's
     */
    public static boolean oneIsPrefixOfOther(Allele a1, Allele a2) {
        if ( a1.isNull() || a2.isNull() )
            return true;

        if ( a2.length() >= a1.length() )
            return firstIsPrefixOfSecond(a1, a2);
        else
            return firstIsPrefixOfSecond(a2, a1);
    }

    private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) {
        String a1String = a1.getBaseString();
        return a2.getBaseString().substring(0, a1String.length()).equals(a1String);
    }
}
oneIsPrefixOfOther(Allele a1, Allele a2) { + if ( a1.isNull() || a2.isNull() ) + return true; + + if ( a2.length() >= a1.length() ) + return firstIsPrefixOfSecond(a1, a2); + else + return firstIsPrefixOfSecond(a2, a1); + } + + private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) { + String a1String = a1.getBaseString(); + return a2.getBaseString().substring(0, a1String.length()).equals(a1String); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Genotype.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Genotype.java new file mode 100755 index 000000000..91aa3b1da --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/Genotype.java @@ -0,0 +1,349 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + + +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * This class encompasses all the basic information about a genotype. It is immutable. 
+ * + * @author Mark DePristo + */ +public class Genotype { + + public final static String PHASED_ALLELE_SEPARATOR = "|"; + public final static String UNPHASED_ALLELE_SEPARATOR = "/"; + + protected InferredGeneticContext commonInfo; + public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; + protected List alleles = null; // new ArrayList(); + protected Type type = null; + + protected boolean isPhased = false; + protected boolean filtersWereAppliedToContext; + + public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { + this(sampleName, alleles, negLog10PError, filters, attributes, isPhased, null); + } + + public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { + if ( alleles != null ) + this.alleles = Collections.unmodifiableList(alleles); + commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); + if ( log10Likelihoods != null ) + commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods)); + filtersWereAppliedToContext = filters != null; + this.isPhased = isPhased; + validate(); + } + + /** + * Creates a new Genotype for sampleName with genotype according to alleles. 
+ * @param sampleName + * @param alleles + * @param negLog10PError the confidence in these alleles + * @param log10Likelihoods a log10 likelihoods for each of the genotype combinations possible for alleles, in the standard VCF ordering, or null if not known + */ + public Genotype(String sampleName, List alleles, double negLog10PError, double[] log10Likelihoods) { + this(sampleName, alleles, negLog10PError, null, null, false, log10Likelihoods); + } + + public Genotype(String sampleName, List alleles, double negLog10PError) { + this(sampleName, alleles, negLog10PError, null, null, false); + } + + public Genotype(String sampleName, List alleles) { + this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null, false); + } + + + // --------------------------------------------------------------------------------------------------------- + // + // Partial-cloning routines (because Genotype is immutable). + // + // --------------------------------------------------------------------------------------------------------- + + public static Genotype modifyName(Genotype g, String name) { + return new Genotype(name, g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); + } + + public static Genotype modifyAttributes(Genotype g, Map attributes) { + return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased()); + } + + public static Genotype modifyAlleles(Genotype g, List alleles) { + return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? 
g.getFilters() : null, g.getAttributes(), g.isPhased()); + } + + /** + * @return the alleles for this genotype + */ + public List getAlleles() { + return alleles; + } + + public List getAlleles(Allele allele) { + if ( getType() == Type.UNAVAILABLE ) + throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); + + List al = new ArrayList(); + for ( Allele a : alleles ) + if ( a.equals(allele) ) + al.add(a); + + return Collections.unmodifiableList(al); + } + + public Allele getAllele(int i) { + if ( getType() == Type.UNAVAILABLE ) + throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); + return alleles.get(i); + } + + public boolean isPhased() { return isPhased; } + + /** + * @return the ploidy of this genotype + */ + public int getPloidy() { + if ( alleles == null ) + throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype"); + return alleles.size(); + } + + public enum Type { + NO_CALL, + HOM_REF, + HET, + HOM_VAR, + UNAVAILABLE, + MIXED // no-call and call in the same genotype + } + + public Type getType() { + if ( type == null ) { + type = determineType(); + } + return type; + } + + protected Type determineType() { + if ( alleles == null ) + return Type.UNAVAILABLE; + + boolean sawNoCall = false, sawMultipleAlleles = false; + Allele observedAllele = null; + + for ( Allele allele : alleles ) { + if ( allele.isNoCall() ) + sawNoCall = true; + else if ( observedAllele == null ) + observedAllele = allele; + else if ( !allele.equals(observedAllele) ) + sawMultipleAlleles = true; + } + + if ( sawNoCall ) { + if ( observedAllele == null ) + return Type.NO_CALL; + return Type.MIXED; + } + + if ( observedAllele == null ) + throw new ReviewedStingException("BUG: there are no alleles present in this genotype but the alleles list is not null"); + + return sawMultipleAlleles ? Type.HET : observedAllele.isReference() ? 
Type.HOM_REF : Type.HOM_VAR; + } + + /** + * @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false. + */ + public boolean isHom() { return isHomRef() || isHomVar(); } + + /** + * @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false. + */ + public boolean isHomRef() { return getType() == Type.HOM_REF; } + + /** + * @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false. + */ + public boolean isHomVar() { return getType() == Type.HOM_VAR; } + + /** + * @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false. + */ + public boolean isHet() { return getType() == Type.HET; } + + /** + * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false. + */ + public boolean isNoCall() { return getType() == Type.NO_CALL; } + + /** + * @return true if this genotype is comprised of any alleles that are not no-calls (even if some are). + */ + public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; } + + /** + * @return true if this genotype is comprised of both calls and no-calls. + */ + public boolean isMixed() { return getType() == Type.MIXED; } + + /** + * @return true if the type of this genotype is set. 
+ */ + public boolean isAvailable() { return getType() != Type.UNAVAILABLE; } + + // + // Useful methods for getting genotype likelihoods for a genotype object, if present + // + public boolean hasLikelihoods() { + return (hasAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4)) || + (hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4)); + } + + public GenotypeLikelihoods getLikelihoods() { + GenotypeLikelihoods x = getLikelihoods(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, true); + if ( x != null ) + return x; + else { + x = getLikelihoods(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, false); + if ( x != null ) return x; + else + throw new IllegalStateException("BUG: genotype likelihood field in " + this.getSampleName() + " sample are not either a string or a genotype likelihood class!"); + } + } + + private GenotypeLikelihoods getLikelihoods(String key, boolean asPL) { + Object x = getAttribute(key); + if ( x instanceof String ) { + if ( asPL ) + return GenotypeLikelihoods.fromPLField((String)x); + else + return GenotypeLikelihoods.fromGLField((String)x); + } + else if ( x instanceof GenotypeLikelihoods ) return (GenotypeLikelihoods)x; + else return null; + } + + public void validate() { + if ( alleles == null ) return; + if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0"); + + // int nNoCalls = 0; + for ( Allele allele : alleles ) { + if ( allele == null ) + throw new IllegalArgumentException("BUG: allele cannot be null in Genotype"); + // nNoCalls += allele.isNoCall() ? 
1 : 0; + } + + // Technically, the spec does allow for the below case so this is not an illegal state + //if ( nNoCalls > 0 && nNoCalls != alleles.size() ) + // throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); + } + + public String getGenotypeString() { + return getGenotypeString(true); + } + + public String getGenotypeString(boolean ignoreRefState) { + if ( alleles == null ) + return null; + + // Notes: + // 1. Make sure to use the appropriate separator depending on whether the genotype is phased + // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele) + // 3. So that everything is deterministic with regards to integration tests, we sort Alleles (when the genotype isn't phased, of course) + return ParsingUtils.join(isPhased() ? PHASED_ALLELE_SEPARATOR : UNPHASED_ALLELE_SEPARATOR, + ignoreRefState ? getAlleleStrings() : (isPhased() ? getAlleles() : ParsingUtils.sortList(getAlleles()))); + } + + private List getAlleleStrings() { + List al = new ArrayList(); + for ( Allele a : alleles ) + al.add(a.getBaseString()); + + return al; + } + + public String toString() { + int Q = (int)Math.round(getPhredScaledQual()); + return String.format("[%s %s Q%s %s]", getSampleName(), getGenotypeString(false), + Q == -10 ? "." 
: String.format("%2d",Q), sortedString(getAttributes())); + } + + public String toBriefString() { + return String.format("%s:Q%.2f", getGenotypeString(false), getPhredScaledQual()); + } + + public boolean sameGenotype(Genotype other) { + return sameGenotype(other, true); + } + + public boolean sameGenotype(Genotype other, boolean ignorePhase) { + if (getPloidy() != other.getPloidy()) + return false; // gotta have the same number of allele to be equal + + // By default, compare the elements in the lists of alleles, element-by-element + Collection thisAlleles = this.getAlleles(); + Collection otherAlleles = other.getAlleles(); + + if (ignorePhase) { // do not care about order, only identity of Alleles + thisAlleles = new TreeSet(thisAlleles); //implemented Allele.compareTo() + otherAlleles = new TreeSet(otherAlleles); + } + + return thisAlleles.equals(otherAlleles); + } + + /** + * a utility method for generating sorted strings from a map key set. + * @param c the map + * @param the key type + * @param the value type + * @return a sting, enclosed in {}, with comma seperated key value pairs in order of the keys + */ + private static , V> String sortedString(Map c) { + // NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS + List t = new ArrayList(c.keySet()); + Collections.sort(t); + + List pairs = new ArrayList(); + for (T k : t) { + pairs.add(k + "=" + c.get(k)); + } + + return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; + } + + + // --------------------------------------------------------------------------------------------------------- + // + // get routines to access context info fields + // + // --------------------------------------------------------------------------------------------------------- + public String getSampleName() { return commonInfo.getName(); } + public Set getFilters() { return commonInfo.getFilters(); } + public boolean isFiltered() { return 
commonInfo.isFiltered(); } + public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } + public boolean filtersWereApplied() { return filtersWereAppliedToContext; } + public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } + public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } + public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } + + public Map getAttributes() { return commonInfo.getAttributes(); } + public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); } + public Object getAttribute(String key) { return commonInfo.getAttribute(key); } + + public Object getAttribute(String key, Object defaultValue) { + return commonInfo.getAttribute(key, defaultValue); + } + + public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } + public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } + public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } + public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/GenotypeLikelihoods.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/GenotypeLikelihoods.java new file mode 100755 index 000000000..02fb32451 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/GenotypeLikelihoods.java @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation 
the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext.v13; + +import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.EnumMap; +import java.util.Map; + +public class GenotypeLikelihoods { + public static final boolean CAP_PLS = false; + public static final int PL_CAP = 255; + + // + // There are two objects here because we are lazy in creating both representations + // for this object: a vector of log10 Probs and the PL phred-scaled string. 
Supports + // having one set during initializating, and dynamic creation of the other, if needed + // + private double[] log10Likelihoods = null; + private String likelihoodsAsString_PLs = null; + + public final static GenotypeLikelihoods fromPLField(String PLs) { + return new GenotypeLikelihoods(PLs); + } + + public final static GenotypeLikelihoods fromGLField(String GLs) { + return new GenotypeLikelihoods(parseDeprecatedGLString(GLs)); + } + + public final static GenotypeLikelihoods fromLog10Likelihoods(double[] log10Likelihoods) { + return new GenotypeLikelihoods(log10Likelihoods); + } + + // + // You must use the factory methods now + // + protected GenotypeLikelihoods(String asString) { + likelihoodsAsString_PLs = asString; + } + + protected GenotypeLikelihoods(double[] asVector) { + log10Likelihoods = asVector; + } + + /** + * Returns the genotypes likelihoods in negative log10 vector format. pr{AA} = x, this + * vector returns math.log10(x) for each of the genotypes. Can return null if the + * genotype likelihoods are "missing". + * + * @return + */ + public double[] getAsVector() { + // assumes one of the likelihoods vector or the string isn't null + if ( log10Likelihoods == null ) { + // make sure we create the GL string if it doesn't already exist + log10Likelihoods = parsePLsIntoLikelihoods(likelihoodsAsString_PLs); + } + + return log10Likelihoods; + } + + public String toString() { + return getAsString(); + } + + public String getAsString() { + if ( likelihoodsAsString_PLs == null ) { + // todo -- should we accept null log10Likelihoods and set PLs as MISSING? 
+ if ( log10Likelihoods == null ) + throw new TribbleException("BUG: Attempted to get likelihoods as strings and neither the vector nor the string is set!"); + likelihoodsAsString_PLs = convertLikelihoodsToPLString(log10Likelihoods); + } + + return likelihoodsAsString_PLs; + } + + //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values + //Returns null in case of missing likelihoods + public EnumMap getAsMap(boolean normalizeFromLog10){ + //Make sure that the log10likelihoods are set + double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); + if(likelihoods == null) + return null; + EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); + likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]); + return likelihoodsMap; + } + + //Return the neg log10 Genotype Quality (GQ) for the given genotype + //Returns Double.NEGATIVE_INFINITY in case of missing genotype + public double getNegLog10GQ(Genotype.Type genotype){ + + double qual = Double.NEGATIVE_INFINITY; + EnumMap likelihoods = getAsMap(false); + if(likelihoods == null) + return qual; + for(Map.Entry likelihood : likelihoods.entrySet()){ + if(likelihood.getKey() == genotype) + continue; + if(likelihood.getValue() > qual) + qual = likelihood.getValue(); + + } + + //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best) + qual = likelihoods.get(genotype) - qual; + + //Quality of other genotypes 1-P(G) + if (qual < 0) { + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = -1.0 * Math.log10(1.0 - chosenGenotype); + } + return qual; + } + + private final static double[] parsePLsIntoLikelihoods(String 
likelihoodsAsString_PLs) { + if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { + String[] strings = likelihoodsAsString_PLs.split(","); + double[] likelihoodsAsVector = new double[strings.length]; + for ( int i = 0; i < strings.length; i++ ) { + likelihoodsAsVector[i] = Integer.parseInt(strings[i]) / -10.0; + } + return likelihoodsAsVector; + } else + return null; + } + + /** + * Back-compatibility function to read old style GL formatted genotype likelihoods in VCF format + * @param GLString + * @return + */ + private final static double[] parseDeprecatedGLString(String GLString) { + if ( !GLString.equals(VCFConstants.MISSING_VALUE_v4) ) { + String[] strings = GLString.split(","); + double[] likelihoodsAsVector = new double[strings.length]; + for ( int i = 0; i < strings.length; i++ ) { + likelihoodsAsVector[i] = Double.parseDouble(strings[i]); + } + return likelihoodsAsVector; + } + + return null; + } + + private final static String convertLikelihoodsToPLString(double[] GLs) { + if ( GLs == null ) + return VCFConstants.MISSING_VALUE_v4; + + StringBuilder s = new StringBuilder(); + + double adjust = Double.NEGATIVE_INFINITY; + for ( double l : GLs ) adjust = Math.max(adjust, l); + + boolean first = true; + for ( double l : GLs ) { + if ( ! 
first ) + s.append(","); + else + first = false; + + long PL = Math.round(-10 * (l - adjust)); + if ( CAP_PLS ) + PL = Math.min(PL, PL_CAP); + s.append(Long.toString(PL)); + } + + return s.toString(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/IndexingVCFWriter.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/IndexingVCFWriter.java new file mode 100644 index 000000000..85444c325 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/IndexingVCFWriter.java @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.variantcontext.v13; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMSequenceDictionary; +import org.broad.tribble.Tribble; +import org.broad.tribble.TribbleException; +import org.broad.tribble.index.DynamicIndexCreator; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.util.LittleEndianOutputStream; +import org.broad.tribble.util.PositionalStream; +import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.*; + +/** + * this class writes VCF files + */ +abstract class IndexingVCFWriter implements VCFWriter { + final private String name; + private final SAMSequenceDictionary refDict; + + private OutputStream outputStream; + private PositionalStream positionalStream = null; + private DynamicIndexCreator indexer = null; + private LittleEndianOutputStream idxStream = null; + + @Requires({"name != null", + "! ( location == null && output == null )", + "! 
( enableOnTheFlyIndexing && location == null )"}) + protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) { + outputStream = output; + this.name = name; + this.refDict = refDict; + + if ( enableOnTheFlyIndexing ) { + try { + idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location))); + //System.out.println("Creating index on the fly for " + location); + indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + indexer.initialize(location, indexer.defaultBinSize()); + positionalStream = new PositionalStream(output); + outputStream = positionalStream; + } catch ( IOException ex ) { + // No matter what we keep going, since we don't care if we can't create the index file + idxStream = null; + indexer = null; + positionalStream = null; + } + } + } + + @Ensures("result != null") + public OutputStream getOutputStream() { + return outputStream; + } + + @Ensures("result != null") + public String getStreamName() { + return name; + } + + public abstract void writeHeader(VCFHeader header); + + /** + * attempt to close the VCF file + */ + public void close() { + // try to close the index stream (keep it separate to help debugging efforts) + if ( indexer != null ) { + try { + Index index = indexer.finalizeIndex(positionalStream.getPosition()); + IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict); + index.write(idxStream); + idxStream.close(); + } catch (IOException e) { + throw new ReviewedStingException("Unable to close index for " + getStreamName(), e); + } + } + } + + /** + * add a record to the file + * + * @param vc the Variant Context object + */ + public void add(VariantContext vc) { + // if we are doing on the fly indexing, add the record ***before*** we write any bytes + if ( indexer != null ) + indexer.addFeature(vc, positionalStream.getPosition()); + } + + /** + * Returns a 
reasonable "name" for this writer, to display to the user if something goes wrong + * + * @param location + * @param stream + * @return + */ + protected static final String writerName(final File location, final OutputStream stream) { + return location == null ? stream.toString() : location.getAbsolutePath(); + } + + /** + * Returns a output stream writing to location, or throws a UserException if this fails + * @param location + * @return + */ + protected static OutputStream openOutputStream(final File location) { + try { + return new FileOutputStream(location); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/InferredGeneticContext.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/InferredGeneticContext.java new file mode 100755 index 000000000..43f61343e --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/InferredGeneticContext.java @@ -0,0 +1,243 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + + +import java.util.*; + + +/** + * Common utility routines for VariantContext and Genotype + * + * @author depristo + */ +final class InferredGeneticContext { + public static final double NO_NEG_LOG_10PERROR = -1.0; + + private static Set NO_FILTERS = Collections.unmodifiableSet(new HashSet()); + private static Map NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap()); + + private double negLog10PError = NO_NEG_LOG_10PERROR; + private String name = null; + private Set filters = NO_FILTERS; + private Map attributes = NO_ATTRIBUTES; + +// public InferredGeneticContext(String name) { +// this.name = name; +// } +// +// public InferredGeneticContext(String name, double negLog10PError) { +// this(name); +// setNegLog10PError(negLog10PError); +// } + + public InferredGeneticContext(String name, double negLog10PError, Set 
filters, Map attributes) { + this.name = name; + setNegLog10PError(negLog10PError); + if ( filters != null ) + setFilters(filters); + if ( attributes != null ) + setAttributes(attributes); + } + + /** + * @return the name + */ + public String getName() { + return name; + } + + /** + * Sets the name + * + * @param name the name associated with this information + */ + public void setName(String name) { + if ( name == null ) throw new IllegalArgumentException("Name cannot be null " + this); + this.name = name; + } + + + // --------------------------------------------------------------------------------------------------------- + // + // Filter + // + // --------------------------------------------------------------------------------------------------------- + + public Set getFilters() { + return Collections.unmodifiableSet(filters); + } + + public boolean isFiltered() { + return filters.size() > 0; + } + + public boolean isNotFiltered() { + return ! isFiltered(); + } + + public void addFilter(String filter) { + if ( filters == NO_FILTERS ) // immutable -> mutable + filters = new HashSet(filters); + + if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this); + if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this); + filters.add(filter); + } + + public void addFilters(Collection filters) { + if ( filters == null ) throw new IllegalArgumentException("BUG: Attempting to add null filters at" + this); + for ( String f : filters ) + addFilter(f); + } + + public void clearFilters() { + filters = new HashSet(); + } + + public void setFilters(Collection filters) { + clearFilters(); + addFilters(filters); + } + + // --------------------------------------------------------------------------------------------------------- + // + // Working with log error rates + // + // 
--------------------------------------------------------------------------------------------------------- + + public boolean hasNegLog10PError() { + return getNegLog10PError() != NO_NEG_LOG_10PERROR; + } + + /** + * @return the -1 * log10-based error estimate + */ + public double getNegLog10PError() { return negLog10PError; } + public double getPhredScaledQual() { return getNegLog10PError() * 10; } + + public void setNegLog10PError(double negLog10PError) { + if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError); + if ( Double.isInfinite(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be Infinity"); + if ( Double.isNaN(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be NaN"); + + this.negLog10PError = negLog10PError; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Working with attributes + // + // --------------------------------------------------------------------------------------------------------- + public void clearAttributes() { + attributes = new HashMap(); + } + + /** + * @return the attribute map + */ + public Map getAttributes() { + return Collections.unmodifiableMap(attributes); + } + + // todo -- define common attributes as enum + + public void setAttributes(Map map) { + clearAttributes(); + putAttributes(map); + } + + public void putAttribute(String key, Object value) { + putAttribute(key, value, false); + } + + public void putAttribute(String key, Object value, boolean allowOverwrites) { + if ( ! 
allowOverwrites && hasAttribute(key) ) + throw new IllegalStateException("Attempting to overwrite key->value binding: key = " + key + " this = " + this); + + if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable + attributes = new HashMap(); + + attributes.put(key, value); + } + + public void removeAttribute(String key) { + if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable + attributes = new HashMap(); + attributes.remove(key); + } + + public void putAttributes(Map map) { + if ( map != null ) { + // for efficiency, we can skip the validation if the map is empty + if ( attributes.size() == 0 ) { + if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable + attributes = new HashMap(); + attributes.putAll(map); + } else { + for ( Map.Entry elt : map.entrySet() ) { + putAttribute(elt.getKey(), elt.getValue(), false); + } + } + } + } + + public boolean hasAttribute(String key) { + return attributes.containsKey(key); + } + + public int getNumAttributes() { + return attributes.size(); + } + + /** + * @param key the attribute key + * + * @return the attribute value for the given key (or null if not set) + */ + public Object getAttribute(String key) { + return attributes.get(key); + } + + public Object getAttribute(String key, Object defaultValue) { + if ( hasAttribute(key) ) + return attributes.get(key); + else + return defaultValue; + } + + public String getAttributeAsString(String key, String defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof String ) return (String)x; + return String.valueOf(x); // throws an exception if this isn't a string + } + + public int getAttributeAsInt(String key, int defaultValue) { + Object x = getAttribute(key); + if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue; + if ( x instanceof Integer ) return (Integer)x; + return Integer.valueOf((String)x); // throws an exception if this isn't a string + } + + public double getAttributeAsDouble(String key, 
double defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Double ) return (Double)x; + return Double.valueOf((String)x); // throws an exception if this isn't a string + } + + public boolean getAttributeAsBoolean(String key, boolean defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Boolean ) return (Boolean)x; + return Boolean.valueOf((String)x); // throws an exception if this isn't a string + } + +// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" +// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } +// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } +// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? 
(Boolean)x : Boolean.valueOf((String)x); } +// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } +// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } +// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } +// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableGenotype.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableGenotype.java new file mode 100755 index 000000000..f5072e040 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableGenotype.java @@ -0,0 +1,68 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +import java.util.*; + +/** + * This class emcompasses all the basic information about a genotype. It is immutable. 
+ * + * @author Mark DePristo + */ +class MutableGenotype extends Genotype { + public MutableGenotype(Genotype parent) { + super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased()); + } + + public MutableGenotype(String sampleName, Genotype parent) { + super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased()); + } + + + public MutableGenotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean genotypesArePhased) { + super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased); + } + + public MutableGenotype(String sampleName, List alleles, double negLog10PError) { + super(sampleName, alleles, negLog10PError); + } + + public MutableGenotype(String sampleName, List alleles) { + super(sampleName, alleles); + } + + public Genotype unmodifiableGenotype() { + return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes(), isPhased()); + } + + + /** + * + * @param alleles list of alleles + */ + public void setAlleles(List alleles) { + this.alleles = new ArrayList(alleles); + validate(); + } + + public void setPhase(boolean isPhased) { + super.isPhased = isPhased; + } + + // --------------------------------------------------------------------------------------------------------- + // + // InferredGeneticContext mutation operators + // + // --------------------------------------------------------------------------------------------------------- + public void setName(String name) { commonInfo.setName(name); } + public void addFilter(String filter) { commonInfo.addFilter(filter); } + public void addFilters(Collection filters) { commonInfo.addFilters(filters); } + public void clearFilters() { commonInfo.clearFilters(); } + public void setFilters(Collection filters) { commonInfo.setFilters(filters); } + public void 
setAttributes(Map map) { commonInfo.setAttributes(map); } + public void clearAttributes() { commonInfo.clearAttributes(); } + public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); } + public void removeAttribute(String key) { commonInfo.removeAttribute(key); } + public void putAttributes(Map map) { commonInfo.putAttributes(map); } + public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); } + public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); } + +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableVariantContext.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableVariantContext.java new file mode 100755 index 000000000..24e71ae50 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/MutableVariantContext.java @@ -0,0 +1,213 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + + +import java.util.Collection; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +/** + * Mutable version of VariantContext + * + * @author depristo + */ +class MutableVariantContext extends VariantContext { + // --------------------------------------------------------------------------------------------------------- + // + // constructors + // + // --------------------------------------------------------------------------------------------------------- + + public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { + super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); + } + + public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, 
Set filters, Map attributes) { + super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); + } + + public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles) { + super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + } + + public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { + super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + } + + public MutableVariantContext(VariantContext parent) { + super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel()); + } + + /** + * Sets the alleles segregating in this context to the collect of alleles. Each of which must be unique according + * to equals() in Allele. Validate() should be called when you are done modifying the context. + * + * @param alleles + */ + public void setAlleles(Collection alleles) { + this.alleles.clear(); + for ( Allele a : alleles ) + addAllele(a); + } + + /** + * Adds allele to the segregating allele list in this context to the collection of alleles. The new + * allele must be be unique according to equals() in Allele. + * Validate() should be called when you are done modifying the context. + * + * @param allele + */ + public void addAllele(Allele allele) { + final boolean allowDuplicates = false; // used to be a parameter + + type = null; + + for ( Allele a : alleles ) { + if ( a.basesMatch(allele) && ! 
allowDuplicates ) + throw new IllegalArgumentException("Duplicate allele added to VariantContext" + this); + } + + // we are a novel allele + alleles.add(allele); + } + + public void clearGenotypes() { + genotypes = new TreeMap(); + } + + /** + * Adds this single genotype to the context, not allowing duplicate genotypes to be added + * @param genotype + */ + public void addGenotypes(Genotype genotype) { + putGenotype(genotype.getSampleName(), genotype, false); + } + + /** + * Adds these genotypes to the context, not allowing duplicate genotypes to be added + * @param genotypes + */ + public void addGenotypes(Collection genotypes) { + for ( Genotype g : genotypes ) { + addGenotype(g); + } + } + + /** + * Adds these genotype to the context, not allowing duplicate genotypes to be added. + * @param genotypes + */ + public void addGenotypes(Map genotypes) { + + for ( Map.Entry elt : genotypes.entrySet() ) { + addGenotype(elt.getValue()); + } + } + + /** + * Adds these genotypes to the context. + * + * @param genotypes + */ + public void putGenotypes(Map genotypes) { + for ( Map.Entry g : genotypes.entrySet() ) + putGenotype(g.getKey(), g.getValue()); + } + + /** + * Adds these genotypes to the context. + * + * @param genotypes + */ + public void putGenotypes(Collection genotypes) { + for ( Genotype g : genotypes ) + putGenotype(g); + } + + /** + * Adds this genotype to the context, throwing an error if it's already bound. + * + * @param genotype + */ + public void addGenotype(Genotype genotype) { + addGenotype(genotype.getSampleName(), genotype); + } + + /** + * Adds this genotype to the context, throwing an error if it's already bound. + * + * @param genotype + */ + public void addGenotype(String sampleName, Genotype genotype) { + putGenotype(sampleName, genotype, false); + } + + /** + * Adds this genotype to the context. 
+ * + * @param genotype + */ + public void putGenotype(Genotype genotype) { + putGenotype(genotype.getSampleName(), genotype); + } + + /** + * Adds this genotype to the context. + * + * @param genotype + */ + public void putGenotype(String sampleName, Genotype genotype) { + putGenotype(sampleName, genotype, true); + } + + private void putGenotype(String sampleName, Genotype genotype, boolean allowOverwrites) { + if ( hasGenotype(sampleName) && ! allowOverwrites ) + throw new IllegalStateException("Attempting to overwrite sample->genotype binding: " + sampleName + " this=" + this); + + if ( ! sampleName.equals(genotype.getSampleName()) ) + throw new IllegalStateException("Sample name doesn't equal genotype.getSample(): " + sampleName + " genotype=" + genotype); + + this.genotypes.put(sampleName, genotype); + } + + /** + * Removes the binding from sampleName to genotype. If this doesn't exist, throws an IllegalArgumentException + * @param sampleName + */ + public void removeGenotype(String sampleName) { + if ( ! this.genotypes.containsKey(sampleName) ) + throw new IllegalArgumentException("Sample name isn't contained in genotypes " + sampleName + " genotypes =" + genotypes); + + this.genotypes.remove(sampleName); + } + + /** + * Removes genotype from the context. 
If this doesn't exist, throws an IllegalArgumentException + * @param genotype + */ + public void removeGenotype(Genotype genotype) { + removeGenotype(genotype.getSampleName()); + } + + // todo -- add replace genotype routine + + // --------------------------------------------------------------------------------------------------------- + // + // InferredGeneticContext mutation operators + // + // --------------------------------------------------------------------------------------------------------- + + public void setSource(String source) { commonInfo.setName(source); } + public void addFilter(String filter) { commonInfo.addFilter(filter); } + public void addFilters(Collection filters) { commonInfo.addFilters(filters); } + public void clearFilters() { commonInfo.clearFilters(); } + public void setFilters(Collection filters) { commonInfo.setFilters(filters); } + public void setAttributes(Map map) { commonInfo.setAttributes(map); } + public void clearAttributes() { commonInfo.clearAttributes(); } + public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); } + public void removeAttribute(String key) { commonInfo.removeAttribute(key); } + public void putAttributes(Map map) { commonInfo.putAttributes(map); } + public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); } + public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); } + public void setID(String id) { putAttribute(ID_KEY, id, true); } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCF3Codec.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCF3Codec.java new file mode 100755 index 000000000..9f653872a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCF3Codec.java @@ -0,0 +1,198 @@ +package 
org.broadinstitute.sting.utils.variantcontext.v13; + +import org.broad.tribble.TribbleException; +import org.broad.tribble.readers.LineReader; +import org.broad.tribble.util.ParsingUtils; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.*; + + +/** + * A feature codec for the VCF3 specification, to read older VCF files. VCF3 has been + * depreciated in favor of VCF4 (See VCF codec for the latest information) + * + *

+ * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example) + *

+ * + *

+ * See also: @see VCF specification
+ * See also: @see VCF spec. publication + *

+ * + * @author Mark DePristo + * @since 2010 + */ +class VCF3Codec extends AbstractVCFCodec { + public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; + + + /** + * @param reader the line reader to take header lines from + * @return the number of header lines + */ + public Object readHeader(LineReader reader) { + List headerStrings = new ArrayList(); + + String line; + try { + boolean foundHeaderVersion = false; + while ((line = reader.readLine()) != null) { + lineNo++; + if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { + String[] lineFields = line.substring(2).split("="); + if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { + if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) + throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); + foundHeaderVersion = true; + version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); + if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 ) + throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]); + } + headerStrings.add(line); + } + else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { + if (!foundHeaderVersion) { + throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); + } + return createHeader(headerStrings, line); + } + else { + throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + } + + } + } catch (IOException e) { + throw new RuntimeException("IO Exception ", e); + } + throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + } + + + /** + * parse the filter string, first checking to see if we already have parsed it in a previous attempt + * @param filterString the string to parse + * @return a set of the filters applied + */ + protected Set 
parseFilters(String filterString) { + + // null for unfiltered + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; + + // empty set for passes filters + LinkedHashSet fFields = new LinkedHashSet(); + + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) + return fFields; + + if ( filterString.length() == 0 ) + generateException("The VCF specification requires a valid filter status"); + + // do we have the filter string cached? + if ( filterHash.containsKey(filterString) ) + return filterHash.get(filterString); + + // otherwise we have to parse and cache the value + if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) + fFields.add(filterString); + else + fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); + + filterHash.put(filterString, fFields); + + return fFields; + } + + /** + * create a genotype map + * @param str the string + * @param alleles the list of alleles + * @param chr chrom + * @param pos position + * @return a mapping of sample name to genotype object + */ + public Map createGenotypeMap(String str, List alleles, String chr, int pos) { + if (genotypeParts == null) + genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; + + int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + + Map genotypes = new LinkedHashMap(nParts); + + // get the format keys + int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); + + // cycle through the sample names + Iterator sampleNameIterator = header.getGenotypeSamples().iterator(); + + // clear out our allele mapping + alleleMap.clear(); + + // cycle through the genotype strings + for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { + int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); + + double GTQual = 
VariantContext.NO_NEG_LOG_10PERROR; + Set genotypeFilters = null; + Map gtAttributes = null; + String sampleName = sampleNameIterator.next(); + + // check to see if the value list is longer than the key list, which is a problem + if (nGTKeys < GTValueSplitSize) + generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]); + + int genotypeAlleleLocation = -1; + if (nGTKeys >= 1) { + gtAttributes = new HashMap(nGTKeys - 1); + + for (int i = 0; i < nGTKeys; i++) { + final String gtKey = new String(genotypeKeyArray[i]); + boolean missing = i >= GTValueSplitSize; + + if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { + genotypeAlleleLocation = i; + } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { + GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); + } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { + genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i])); + } else if ( missing || GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) { + gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4); + } else { + gtAttributes.put(gtKey, new String(GTValueArray[i])); + } + } + } + + // check to make sure we found a genotype field + if ( genotypeAlleleLocation < 0 ) + generateException("Unable to find the GT field for the record; the GT field is required"); + if ( genotypeAlleleLocation > 0 ) + generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes"); + + boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; + + // add it to the list + try { + genotypes.put(sampleName, new Genotype(sampleName, + parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap), + GTQual, + genotypeFilters, + gtAttributes, + phased)); + } catch (TribbleException e) 
{ + throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); + } + } + + return genotypes; + } + + @Override + public boolean canDecode(final File potentialInput) { + return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFAltHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFAltHeaderLine.java new file mode 100644 index 000000000..e432fe411 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFAltHeaderLine.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +/** + * @author ebanks + * A class representing a key=value entry for ALT fields in the VCF header + */ +class VCFAltHeaderLine extends VCFSimpleHeaderLine { + + /** + * create a VCF filter header line + * + * @param name the name for this header line + * @param description the description for this header line + */ + public VCFAltHeaderLine(String name, String description) { + super(name, description, SupportedHeaderLineType.ALT); + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + */ + protected VCFAltHeaderLine(String line, VCFHeaderVersion version) { + super(line, version, SupportedHeaderLineType.ALT); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCodec.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCodec.java new file mode 100755 index 000000000..f873aebcc --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCodec.java @@ -0,0 +1,228 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +import org.broad.tribble.TribbleException; +import org.broad.tribble.readers.LineReader; +import org.broad.tribble.util.ParsingUtils; + +import java.io.File; +import 
java.io.FileReader; +import java.io.IOException; +import java.util.*; + +/** + * A feature codec for the VCF 4 specification + * + *

+ * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a + * header line, and then data lines each containing information about a position in the genome. + *

+ *

One of the main uses of next-generation sequencing is to discover variation amongst large populations + * of related samples. Recently the format for storing next-generation read alignments has been + * standardised by the SAM/BAM file format specification. This has significantly improved the + * interoperability of next-generation tools for alignment, visualisation, and variant calling. + * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent + * types of sequence variation, including SNPs, indels and larger structural variants, together + * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for + * fast data retrieval of variants from a range of positions on the reference genome. + * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects + * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements + * various utilities for processing VCF files, including validation, merging and comparing, + * and also provides a general Perl and Python API. + * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.

+ * + *

+ * See also: @see VCF specification
+ * See also: @see VCF spec. publication + *

+ * + *

File format example

+ *
+ *     ##fileformat=VCFv4.0
+ *     #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
+ *     chr1    109     .       A       T       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:610,327:308:-316.30,-95.47,-803.03:99
+ *     chr1    147     .       C       A       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:294,49:118:-57.87,-34.96,-338.46:99
+ * 
+ * + * @author Mark DePristo + * @since 2010 + */ +public class VCFCodec extends AbstractVCFCodec { + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + + public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; + + /** + * @param reader the line reader to take header lines from + * @return the number of header lines + */ + public Object readHeader(LineReader reader) { + List headerStrings = new ArrayList(); + + String line; + try { + boolean foundHeaderVersion = false; + while ((line = reader.readLine()) != null) { + lineNo++; + if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { + String[] lineFields = line.substring(2).split("="); + if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { + if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) + throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); + foundHeaderVersion = true; + version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); + if ( version == VCFHeaderVersion.VCF3_3 || version == VCFHeaderVersion.VCF3_2 ) + throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); + if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 ) + throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); + } + headerStrings.add(line); + } + else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { + if (!foundHeaderVersion) { + throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); + } + return createHeader(headerStrings, line); + } + else { + throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + } + + } + } catch (IOException e) { + throw new 
RuntimeException("IO Exception ", e); + } + throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + } + + + /** + * parse the filter string, first checking to see if we already have parsed it in a previous attempt + * + * @param filterString the string to parse + * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) + */ + protected Set parseFilters(String filterString) { + return parseFilters(filterHash, lineNo, filterString); + } + + public static Set parseFilters(final Map> cache, final int lineNo, final String filterString) { + // null for unfiltered + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; + + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) + return Collections.emptySet(); + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) + generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); + if ( filterString.length() == 0 ) + generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); + + // do we have the filter string cached? 
+ if ( cache != null && cache.containsKey(filterString) ) + return Collections.unmodifiableSet(cache.get(filterString)); + + // empty set for passes filters + LinkedHashSet fFields = new LinkedHashSet(); + // otherwise we have to parse and cache the value + if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) + fFields.add(filterString); + else + fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); + + fFields = fFields; + if ( cache != null ) cache.put(filterString, fFields); + + return Collections.unmodifiableSet(fFields); + } + + + /** + * create a genotype map + * @param str the string + * @param alleles the list of alleles + * @return a mapping of sample name to genotype object + */ + public Map createGenotypeMap(String str, List alleles, String chr, int pos) { + if (genotypeParts == null) + genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; + + int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + + Map genotypes = new LinkedHashMap(nParts); + + // get the format keys + int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); + + // cycle through the sample names + Iterator sampleNameIterator = header.getGenotypeSamples().iterator(); + + // clear out our allele mapping + alleleMap.clear(); + + // cycle through the genotype strings + for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { + int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); + + double GTQual = VariantContext.NO_NEG_LOG_10PERROR; + Set genotypeFilters = null; + Map gtAttributes = null; + String sampleName = sampleNameIterator.next(); + + // check to see if the value list is longer than the key list, which is a problem + if (nGTKeys < GTValueSplitSize) + generateException("There are too many keys for the sample " + sampleName + ", 
keys = " + parts[8] + ", values = " + parts[genotypeOffset]); + + int genotypeAlleleLocation = -1; + if (nGTKeys >= 1) { + gtAttributes = new HashMap(nGTKeys - 1); + + for (int i = 0; i < nGTKeys; i++) { + final String gtKey = new String(genotypeKeyArray[i]); + boolean missing = i >= GTValueSplitSize; + + // todo -- all of these on the fly parsing of the missing value should be static constants + if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { + genotypeAlleleLocation = i; + } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { + GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); + } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { + genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i])); + } else if ( missing ) { + gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4); + } else { + gtAttributes.put(gtKey, new String(GTValueArray[i])); + } + } + } + + // check to make sure we found a genotype field if we are a VCF4.0 file + if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 ) + generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0"); + if ( genotypeAlleleLocation > 0 ) + generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); + + List GTalleles = (genotypeAlleleLocation == -1 ? 
null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); + boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; + + // add it to the list + try { + genotypes.put(sampleName, + new Genotype(sampleName, + GTalleles, + GTQual, + genotypeFilters, + gtAttributes, + phased)); + } catch (TribbleException e) { + throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); + } + } + + return genotypes; + } + + @Override + public boolean canDecode(final File potentialInput) { + return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCompoundHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCompoundHeaderLine.java new file mode 100755 index 000000000..74d6cf62f --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFCompoundHeaderLine.java @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext.v13; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * a base class for compound header lines, which include info lines and format lines (so far) + */ +abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { + public enum SupportedHeaderLineType { + INFO(true), FORMAT(false); + + public final boolean allowFlagValues; + SupportedHeaderLineType(boolean flagValues) { + allowFlagValues = flagValues; + } + } + + // the field types + private String name; + private int count = -1; + private VCFHeaderLineCount countType; + private String description; + private VCFHeaderLineType type; + + // access methods + public String getName() { return name; } + public String getDescription() { return description; } + public VCFHeaderLineType getType() { return type; } + public VCFHeaderLineCount getCountType() { return countType; } + public int getCount() { + if ( countType != VCFHeaderLineCount.INTEGER ) + throw new ReviewedStingException("Asking for header line count when type is not an integer"); + return count; + } + + // utility method + public int getCount(int numAltAlleles) { + int myCount; + switch ( countType ) { + case INTEGER: myCount = count; break; + case UNBOUNDED: myCount = -1; break; + case A: myCount = numAltAlleles; break; + case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break; + default: throw new ReviewedStingException("Unknown count type: " + countType); + } + return myCount; + } + + public void setNumberToUnbounded() { + countType = 
VCFHeaderLineCount.UNBOUNDED; + count = -1; + } + + // our type of line, i.e. format, info, etc + private final SupportedHeaderLineType lineType; + + /** + * create a VCF format header line + * + * @param name the name for this header line + * @param count the count for this header line + * @param type the type for this header line + * @param description the description for this header line + * @param lineType the header line type + */ + protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.name = name; + this.countType = VCFHeaderLineCount.INTEGER; + this.count = count; + this.type = type; + this.description = description; + this.lineType = lineType; + validate(); + } + + /** + * create a VCF format header line + * + * @param name the name for this header line + * @param count the count type for this header line + * @param type the type for this header line + * @param description the description for this header line + * @param lineType the header line type + */ + protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.name = name; + this.countType = count; + this.type = type; + this.description = description; + this.lineType = lineType; + validate(); + } + + /** + * create a VCF format header line + * + * @param line the header line + * @param version the VCF header version + * @param lineType the header line type + * + */ + protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description")); + name = mapping.get("ID"); + count = -1; + final String numberStr = mapping.get("Number"); + if ( 
numberStr.equals(VCFConstants.PER_ALLELE_COUNT) ) { + countType = VCFHeaderLineCount.A; + } else if ( numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT) ) { + countType = VCFHeaderLineCount.G; + } else if ( ((version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) && + numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || + ((version == VCFHeaderVersion.VCF3_2 || version == VCFHeaderVersion.VCF3_3) && + numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3)) ) { + countType = VCFHeaderLineCount.UNBOUNDED; + } else { + countType = VCFHeaderLineCount.INTEGER; + count = Integer.valueOf(numberStr); + + } + type = VCFHeaderLineType.valueOf(mapping.get("Type")); + if (type == VCFHeaderLineType.Flag && !allowFlagValues()) + throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); + + description = mapping.get("Description"); + if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided + description = UNBOUND_DESCRIPTION; + + this.lineType = lineType; + + validate(); + } + + private void validate() { + if ( name == null || type == null || description == null || lineType == null ) + throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", + super.getKey(), name, type, description, lineType )); + } + + /** + * make a string representation of this header line + * @return a string representation + */ + protected String toStringEncoding() { + Map map = new LinkedHashMap(); + map.put("ID", name); + Object number; + switch ( countType ) { + case A: number = VCFConstants.PER_ALLELE_COUNT; break; + case G: number = VCFConstants.PER_GENOTYPE_COUNT; break; + case UNBOUNDED: number = VCFConstants.UNBOUNDED_ENCODING_v4; break; + case INTEGER: + default: number = count; + } + map.put("Number", number); + map.put("Type", type); + map.put("Description", description); + return lineType.toString() + "=" + 
toStringEncoding(map); + } + + /** + * returns true if we're equal to another compounder header line + * @param o a compound header line + * @return true if equal + */ + public boolean equals(Object o) { + if ( !(o instanceof VCFCompoundHeaderLine) ) + return false; + VCFCompoundHeaderLine other = (VCFCompoundHeaderLine)o; + return equalsExcludingDescription(other) && + description.equals(other.description); + } + + public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { + return count == other.count && + countType == other.countType && + type == other.type && + lineType == other.lineType && + name.equals(other.name); + } + + public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) { + return lineType == other.lineType && + name.equals(other.name); + } + + /** + * do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true) + * @return true if we do, false otherwise + */ + abstract boolean allowFlagValues(); + +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFConstants.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFConstants.java new file mode 100755 index 000000000..91f6b1ba9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFConstants.java @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext.v13; + +import java.util.Locale; + +final class VCFConstants { + public static final Locale VCF_LOCALE = Locale.US; + + // reserved INFO/FORMAT field keys + public static final String ANCESTRAL_ALLELE_KEY = "AA"; + public static final String ALLELE_COUNT_KEY = "AC"; + public static final String ALLELE_FREQUENCY_KEY = "AF"; + public static final String ALLELE_NUMBER_KEY = "AN"; + public static final String RMS_BASE_QUALITY_KEY = "BQ"; + public static final String CIGAR_KEY = "CIGAR"; + public static final String DBSNP_KEY = "DB"; + public static final String DEPTH_KEY = "DP"; + public static final String DOWNSAMPLED_KEY = "DS"; + public static final String EXPECTED_ALLELE_COUNT_KEY = "EC"; + public static final String END_KEY = "END"; + public static final String GENOTYPE_FILTER_KEY = "FT"; + public static final String GENOTYPE_KEY = "GT"; + @Deprecated + public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods + public static final String GENOTYPE_POSTERIORS_KEY = "GP"; + public static final String GENOTYPE_QUALITY_KEY = "GQ"; + public static final String HAPMAP2_KEY = "H2"; + public static final String HAPMAP3_KEY = "H3"; + public static final String HAPLOTYPE_QUALITY_KEY = "HQ"; + public static final String RMS_MAPPING_QUALITY_KEY = "MQ"; + public static final String MAPPING_QUALITY_ZERO_KEY = "MQ0"; + public static final String SAMPLE_NUMBER_KEY = "NS"; + public static final 
String PHRED_GENOTYPE_LIKELIHOODS_KEY = "PL"; // phred-scaled genotype likelihoods + public static final String PHASE_QUALITY_KEY = "PQ"; + public static final String PHASE_SET_KEY = "PS"; + public static final String OLD_DEPTH_KEY = "RD"; + public static final String STRAND_BIAS_KEY = "SB"; + public static final String SOMATIC_KEY = "SOMATIC"; + public static final String VALIDATED_KEY = "VALIDATED"; + public static final String THOUSAND_GENOMES_KEY = "1000G"; + + // separators + public static final String FORMAT_FIELD_SEPARATOR = ":"; + public static final String GENOTYPE_FIELD_SEPARATOR = ":"; + public static final char GENOTYPE_FIELD_SEPARATOR_CHAR = ':'; + public static final String FIELD_SEPARATOR = "\t"; + public static final char FIELD_SEPARATOR_CHAR = '\t'; + public static final String FILTER_CODE_SEPARATOR = ";"; + public static final String INFO_FIELD_ARRAY_SEPARATOR = ","; + public static final char INFO_FIELD_ARRAY_SEPARATOR_CHAR = ','; + public static final String ID_FIELD_SEPARATOR = ";"; + public static final String INFO_FIELD_SEPARATOR = ";"; + public static final char INFO_FIELD_SEPARATOR_CHAR = ';'; + public static final String UNPHASED = "/"; + public static final String PHASED = "|"; + public static final String PHASED_SWITCH_PROB_v3 = "\\"; + public static final String PHASING_TOKENS = "/|\\"; + + // old indel alleles + public static final char DELETION_ALLELE_v3 = 'D'; + public static final char INSERTION_ALLELE_v3 = 'I'; + + // missing/default values + public static final String UNFILTERED = "."; + public static final String PASSES_FILTERS_v3 = "0"; + public static final String PASSES_FILTERS_v4 = "PASS"; + public static final String EMPTY_ID_FIELD = "."; + public static final String EMPTY_INFO_FIELD = "."; + public static final String EMPTY_ALTERNATE_ALLELE_FIELD = "."; + public static final String MISSING_VALUE_v4 = "."; + public static final String MISSING_QUALITY_v3 = "-1"; + public static final Double MISSING_QUALITY_v3_DOUBLE = 
Double.valueOf(MISSING_QUALITY_v3); + + public static final String MISSING_GENOTYPE_QUALITY_v3 = "-1"; + public static final String MISSING_HAPLOTYPE_QUALITY_v3 = "-1"; + public static final String MISSING_DEPTH_v3 = "-1"; + public static final String UNBOUNDED_ENCODING_v4 = "."; + public static final String UNBOUNDED_ENCODING_v3 = "-1"; + public static final String PER_ALLELE_COUNT = "A"; + public static final String PER_GENOTYPE_COUNT = "G"; + public static final String EMPTY_ALLELE = "."; + public static final String EMPTY_GENOTYPE = "./."; + public static final double MAX_GENOTYPE_QUAL = 99.0; + + public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f"; + public static final String DOUBLE_PRECISION_INT_SUFFIX = ".00"; + public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFilterHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFilterHeaderLine.java new file mode 100755 index 000000000..5e16fbed0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFilterHeaderLine.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +/** + * @author ebanks + * A class representing a key=value entry for FILTER fields in the VCF header + */ +class VCFFilterHeaderLine extends VCFSimpleHeaderLine { + + /** + * create a VCF filter header line + * + * @param name the name for this header line + * @param description the description for this header line + */ + public VCFFilterHeaderLine(String name, String description) { + super(name, description, SupportedHeaderLineType.FILTER); + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + */ + protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) { + super(line, 
version, SupportedHeaderLineType.FILTER); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFormatHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFormatHeaderLine.java new file mode 100755 index 000000000..f73c032cc --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFFormatHeaderLine.java @@ -0,0 +1,32 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + + +/** + * @author ebanks + *

+ * Class VCFFormatHeaderLine + *

+ * A class representing a key=value entry for genotype FORMAT fields in the VCF header + */ +class VCFFormatHeaderLine extends VCFCompoundHeaderLine { + + public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) { + super(name, count, type, description, SupportedHeaderLineType.FORMAT); + if (type == VCFHeaderLineType.Flag) + throw new IllegalArgumentException("Flag is an unsupported type for format fields"); + } + + public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { + super(name, count, type, description, SupportedHeaderLineType.FORMAT); + } + + protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) { + super(line, version, SupportedHeaderLineType.FORMAT); + } + + // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype). + @Override + boolean allowFlagValues() { + return false; + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeader.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeader.java new file mode 100755 index 000000000..be1b49ec1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeader.java @@ -0,0 +1,198 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + + +import org.broad.tribble.util.ParsingUtils; + +import java.util.*; + + +/** + * @author aaron + *

+ * Class VCFHeader + *

+ * A class representing the VCF header + */ +class VCFHeader { + + // the mandatory header fields + public enum HEADER_FIELDS { + CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO + } + + // the associated meta data + private final Set mMetaData; + private final Map mInfoMetaData = new HashMap(); + private final Map mFormatMetaData = new HashMap(); + private final Map mOtherMetaData = new HashMap(); + + // the list of auxillary tags + private final Set mGenotypeSampleNames = new LinkedHashSet(); + + // the character string that indicates meta data + public static final String METADATA_INDICATOR = "##"; + + // the header string indicator + public static final String HEADER_INDICATOR = "#"; + + // were the input samples sorted originally (or are we sorting them)? + private boolean samplesWereAlreadySorted = true; + + + /** + * create a VCF header, given a list of meta data and auxillary tags + * + * @param metaData the meta data associated with this header + */ + public VCFHeader(Set metaData) { + mMetaData = new TreeSet(metaData); + loadVCFVersion(); + loadMetaDataMaps(); + } + + /** + * create a VCF header, given a list of meta data and auxillary tags + * + * @param metaData the meta data associated with this header + * @param genotypeSampleNames the sample names + */ + public VCFHeader(Set metaData, Set genotypeSampleNames) { + mMetaData = new TreeSet(); + if ( metaData != null ) + mMetaData.addAll(metaData); + + mGenotypeSampleNames.addAll(genotypeSampleNames); + + loadVCFVersion(); + loadMetaDataMaps(); + + samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames); + } + + /** + * Adds a header line to the header metadata. + * + * @param headerLine Line to add to the existing metadata component. 
+ */ + public void addMetaDataLine(VCFHeaderLine headerLine) { + mMetaData.add(headerLine); + } + + /** + * check our metadata for a VCF version tag, and throw an exception if the version is out of date + * or the version is not present + */ + public void loadVCFVersion() { + List toRemove = new ArrayList(); + for ( VCFHeaderLine line : mMetaData ) + if ( VCFHeaderVersion.isFormatString(line.getKey())) { + toRemove.add(line); + } + // remove old header lines for now, + mMetaData.removeAll(toRemove); + + } + + /** + * load the format/info meta data maps (these are used for quick lookup by key name) + */ + private void loadMetaDataMaps() { + for ( VCFHeaderLine line : mMetaData ) { + if ( line instanceof VCFInfoHeaderLine ) { + VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; + mInfoMetaData.put(infoLine.getName(), infoLine); + } + else if ( line instanceof VCFFormatHeaderLine ) { + VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; + mFormatMetaData.put(formatLine.getName(), formatLine); + } + else { + mOtherMetaData.put(line.getKey(), line); + } + } + } + + /** + * get the header fields in order they're presented in the input file (which is now required to be + * the order presented in the spec). 
+ * + * @return a set of the header fields, in order + */ + public Set getHeaderFields() { + Set fields = new LinkedHashSet(); + for (HEADER_FIELDS field : HEADER_FIELDS.values()) + fields.add(field); + return fields; + } + + /** + * get the meta data, associated with this header + * + * @return a set of the meta data + */ + public Set getMetaData() { + Set lines = new LinkedHashSet(); + lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); + lines.addAll(mMetaData); + return Collections.unmodifiableSet(lines); + } + + /** + * get the genotyping sample names + * + * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false + */ + public Set getGenotypeSamples() { + return mGenotypeSampleNames; + } + + /** + * do we have genotyping data? + * + * @return true if we have genotyping columns, false otherwise + */ + public boolean hasGenotypingData() { + return mGenotypeSampleNames.size() > 0; + } + + /** + * were the input samples sorted originally? + * + * @return true if the input samples were sorted originally, false otherwise + */ + public boolean samplesWereAlreadySorted() { + return samplesWereAlreadySorted; + } + + /** @return the column count */ + public int getColumnCount() { + return HEADER_FIELDS.values().length + (hasGenotypingData() ? 
mGenotypeSampleNames.size() + 1 : 0); + } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFInfoHeaderLine getInfoHeaderLine(String key) { + return mInfoMetaData.get(key); + } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFFormatHeaderLine getFormatHeaderLine(String key) { + return mFormatMetaData.get(key); + } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFHeaderLine getOtherHeaderLine(String key) { + return mOtherMetaData.get(key); + } +} + + + diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLine.java new file mode 100755 index 000000000..61b0722bd --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFHeaderLine.java @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.utils.variantcontext.v13;

import org.broad.tribble.TribbleException;

import java.util.Map;


/**
 * Class VCFHeaderLine
 *
 * A class representing a key=value entry in the VCF header.
 *
 * @author ebanks
 */
class VCFHeaderLine implements Comparable {
    protected static boolean ALLOW_UNBOUND_DESCRIPTIONS = true;
    protected static String UNBOUND_DESCRIPTION = "Not provided in original VCF header";

    private String mKey = null;
    // may legitimately be null (the constructor only rejects a null key)
    private String mValue = null;


    /**
     * create a VCF header line
     *
     * @param key   the key for this header line; must not be null
     * @param value the value for this header line; may be null
     */
    public VCFHeaderLine(String key, String value) {
        if ( key == null )
            throw new IllegalArgumentException("VCFHeaderLine: key cannot be null: key = " + key);
        mKey = key;
        mValue = value;
    }

    /** @return the key */
    public String getKey() {
        return mKey;
    }

    /** @return the value, possibly null */
    public String getValue() {
        return mValue;
    }

    public String toString() {
        return toStringEncoding();
    }

    /**
     * Should be overloaded in sub classes to do subclass-specific encoding.
     *
     * @return the string encoding
     */
    protected String toStringEncoding() {
        return mKey + "=" + mValue;
    }

    public boolean equals(Object o) {
        if ( !(o instanceof VCFHeaderLine) )
            return false;
        VCFHeaderLine that = (VCFHeaderLine)o;
        // FIX: null-safe on mValue (the constructor allows a null value; the old
        // code called mValue.equals() unconditionally and could NPE)
        return mKey.equals(that.mKey)
            && (mValue == null ? that.mValue == null : mValue.equals(that.mValue));
    }

    // FIX: equals() was overridden without hashCode(), violating the Object contract;
    // header lines are stored in HashSets (e.g. VCFUtils.smartMergeHeaders), where
    // a missing hashCode breaks de-duplication
    @Override
    public int hashCode() {
        return 31 * mKey.hashCode() + (mValue == null ? 0 : mValue.hashCode());
    }

    public int compareTo(Object other) {
        // ordering by the full string encoding, consistent with toString()
        return toString().compareTo(other.toString());
    }

    /**
     * @param line the line
     * @return true if the line is a VCF meta data line (starts with "#"), false otherwise
     */
    public static boolean isHeaderLine(String line) {
        return line != null && line.length() > 0 && VCFHeader.HEADER_INDICATOR.equals(line.substring(0, 1));
    }

    /**
     * create a "<K1=V1,K2=V2,...>" string of mapping pairs for the target VCF version
     *
     * @param keyValues a mapping of the key->value pairs to output
     * @return a string, correctly formatted
     */
    public static String toStringEncoding(Map keyValues) {
        StringBuilder builder = new StringBuilder();
        builder.append("<");
        boolean start = true;
        for (Object o : keyValues.entrySet()) {
            Map.Entry entry = (Map.Entry)o;
            if (start) start = false;
            else builder.append(",");

            if ( entry.getValue() == null ) throw new TribbleException.InternalCodecException("Header problem: unbound value at " + entry + " from " + keyValues);

            builder.append(entry.getKey());
            builder.append("=");
            // quote values containing ',' or ' ', and always quote Description
            builder.append(entry.getValue().toString().contains(",") ||
                           entry.getValue().toString().contains(" ") ||
                           entry.getKey().equals("Description") ? "\"" + entry.getValue() + "\"" : entry.getValue());
        }
        builder.append(">");
        return builder.toString();
    }
}
package org.broadinstitute.sting.utils.variantcontext.v13;

/**
 * the count encodings we use for fields in VCF header lines
 */
public enum VCFHeaderLineCount {
    INTEGER, A, G, UNBOUNDED;
}
package org.broadinstitute.sting.utils.variantcontext.v13;

import java.util.*;

/**
 * A class for translating between vcf header versions: dispatches a raw
 * header-line value string to the parser for the given version.
 */
public class VCFHeaderLineTranslator {
    private static Map<VCFHeaderVersion, VCFLineParser> mapping;

    static {
        mapping = new HashMap<VCFHeaderVersion, VCFLineParser>();
        mapping.put(VCFHeaderVersion.VCF4_0, new VCF4Parser());
        mapping.put(VCFHeaderVersion.VCF4_1, new VCF4Parser());
        mapping.put(VCFHeaderVersion.VCF3_3, new VCF3Parser());
        mapping.put(VCFHeaderVersion.VCF3_2, new VCF3Parser());
    }

    /**
     * @param version          the header version the line was written in
     * @param valueLine        the raw value text of the header line
     * @param expectedTagOrder the tags we expect, in order
     * @return a mapping of parsed-out tag -> value
     */
    public static Map<String, String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder) {
        return mapping.get(version).parseLine(valueLine, expectedTagOrder);
    }
}


interface VCFLineParser {
    public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder);
}


/**
 * a class that handles the to and from disk for VCF 4 lines
 */
class VCF4Parser implements VCFLineParser {

    /**
     * parse a VCF4 line of the form &lt;K1=V1,K2=V2,...&gt;
     *
     * @param valueLine        the line
     * @param expectedTagOrder the tags we expect, in order
     * @return a mapping of the tags parsed out
     * @throws IllegalArgumentException if the tags are not the expected ones, in the expected order
     */
    public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
        // our return map
        Map<String, String> ret = new LinkedHashMap<String, String>();

        // a builder to store up characters as we go
        StringBuilder builder = new StringBuilder();

        // the key currently being parsed
        String key = "";

        // where are we in the stream of characters?
        int index = 0;

        // are we inside a quotation? we don't special-case ',' then
        boolean inQuote = false;

        // a little switch machine to parse out the tags (regex ended up really complicated and ugly)
        for (char c : valueLine.toCharArray()) {
            if ( c == '\"' ) {
                inQuote = !inQuote;
            } else if ( inQuote ) {
                builder.append(c);
            } else {
                switch (c) {
                    // a leading '<' is ignored; otherwise this deliberately FALLS THROUGH
                    // to the '>' case (quirk preserved from the original implementation)
                    case ('<') : if (index == 0) break;
                    // a trailing '>' finishes the last key/value pair
                    case ('>') : if (index == valueLine.length() - 1) ret.put(key, builder.toString().trim()); break;
                    // '=' ends a key
                    case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break;
                    // ',' ends a value: drop the current pair into the return map
                    case (',') : ret.put(key, builder.toString().trim()); builder = new StringBuilder(); break;
                    default: builder.append(c);
                }
            }

            index++;
        }

        // validate the tags against the expected list
        index = 0;
        if (ret.size() > expectedTagOrder.size())
            // FIX: the original message printed expectedTagOrder.size() where the input line belonged
            throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " (expected at most " + expectedTagOrder.size() + ") in string " + valueLine);
        for (String str : ret.keySet()) {
            if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
            index++;
        }
        return ret;
    }
}

/**
 * a class that handles parsing of VCF 3 header lines: positional,
 * comma-separated values matched against the expected tag order
 */
class VCF3Parser implements VCFLineParser {

    public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
        // our return map
        Map<String, String> ret = new LinkedHashMap<String, String>();

        // a builder to store up characters as we go
        StringBuilder builder = new StringBuilder();

        // where are we in the stream of characters?
        int index = 0;
        // where in the expected tag order are we?
        int tagIndex = 0;

        // are we inside a quotation? we don't special-case ',' then
        boolean inQuote = false;

        for (char c : valueLine.toCharArray()) {
            switch (c) {
                // a quote means we ignore ',' in our strings; keep track of it
                case ('\"') : inQuote = !inQuote; break;
                // an unquoted ',' drops the current value into the return map;
                // a quoted ',' deliberately falls through to default and is appended
                case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++), builder.toString()); builder = new StringBuilder(); break; }
                default: builder.append(c);
            }
            index++;
        }
        // the last value has no trailing ','
        ret.put(expectedTagOrder.get(tagIndex++), builder.toString());

        // validate the tags against the expected list
        index = 0;
        if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size());
        for (String str : ret.keySet()) {
            if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
            index++;
        }
        return ret;
    }
}
package org.broadinstitute.sting.utils.variantcontext.v13;

/**
 * the type encodings we use for fields in VCF header lines
 */
enum VCFHeaderLineType {
    Integer, Float, String, Character, Flag;
}
package
org.broadinstitute.sting.utils.variantcontext.v13; + +import org.broad.tribble.TribbleException; + +/** + * information that identifies each header version + */ +enum VCFHeaderVersion { + VCF3_2("VCRv3.2","format"), + VCF3_3("VCFv3.3","fileformat"), + VCF4_0("VCFv4.0","fileformat"), + VCF4_1("VCFv4.1","fileformat"); + + private final String versionString; + private final String formatString; + + /** + * create the enum, privately, using: + * @param vString the version string + * @param fString the format string + */ + VCFHeaderVersion(String vString, String fString) { + this.versionString = vString; + this.formatString = fString; + } + + /** + * get the header version + * @param version the version string + * @return a VCFHeaderVersion object + */ + public static VCFHeaderVersion toHeaderVersion(String version) { + version = clean(version); + for (VCFHeaderVersion hv : VCFHeaderVersion.values()) + if (hv.versionString.equals(version)) + return hv; + return null; + } + + /** + * are we a valid version string of some type + * @param version the version string + * @return true if we're valid of some type, false otherwise + */ + public static boolean isVersionString(String version){ + return toHeaderVersion(version) != null; + } + + /** + * are we a valid format string for some type + * @param format the format string + * @return true if we're valid of some type, false otherwise + */ + public static boolean isFormatString(String format){ + format = clean(format); + for (VCFHeaderVersion hv : VCFHeaderVersion.values()) + if (hv.formatString.equals(format)) + return true; + return false; + } + + public static VCFHeaderVersion getHeaderVersion(String versionLine) { + String[] lineFields = versionLine.split("="); + if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) ) + throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line"); + + if ( !isVersionString(lineFields[1]) ) + throw new 
TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); + + return toHeaderVersion(lineFields[1]); + } + + /** + * Utility function to clean up a VCF header string + * + * @param s string + * @return trimmed version of s + */ + private static String clean(String s) { + return s.trim(); + } + + + public String getVersionString() { + return versionString; + } + + public String getFormatString() { + return formatString; + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFInfoHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFInfoHeaderLine.java new file mode 100755 index 000000000..642a78b76 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFInfoHeaderLine.java @@ -0,0 +1,29 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + + +/** + * @author ebanks + *

+ * Class VCFInfoHeaderLine + *

+ * A class representing a key=value entry for INFO fields in the VCF header + */ +class VCFInfoHeaderLine extends VCFCompoundHeaderLine { + public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { + super(name, count, type, description, SupportedHeaderLineType.INFO); + } + + public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { + super(name, count, type, description, SupportedHeaderLineType.INFO); + } + + protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) { + super(line, version, SupportedHeaderLineType.INFO); + } + + // info fields allow flag values + @Override + boolean allowFlagValues() { + return true; + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFNamedHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFNamedHeaderLine.java new file mode 100755 index 000000000..b3ce5d841 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFNamedHeaderLine.java @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext.v13; + +/** an interface for named header lines **/ +interface VCFNamedHeaderLine { + String getName(); +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFParser.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFParser.java new file mode 100755 index 000000000..95a3f7bf1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFParser.java @@ -0,0 +1,22 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +import java.util.List; +import java.util.Map; + + +/** + * All VCF codecs need to implement this interface so that we can perform lazy loading. + */ +interface VCFParser { + + /** + * create a genotype map + * @param str the string + * @param alleles the list of alleles + * @param chr chrom + * @param pos position + * @return a mapping of sample name to genotype object + */ + public Map createGenotypeMap(String str, List alleles, String chr, int pos); + +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFSimpleHeaderLine.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFSimpleHeaderLine.java new file mode 100644 index 000000000..17706c705 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFSimpleHeaderLine.java @@ -0,0 +1,81 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.Map; + + +/** + * @author ebanks + * A class representing a key=value entry for simple VCF header types + */ +abstract class VCFSimpleHeaderLine extends VCFHeaderLine 
implements VCFNamedHeaderLine { + + public enum SupportedHeaderLineType { + FILTER, ALT; + } + + private String name; + private String description; + + // our type of line, i.e. filter, alt, etc + private final SupportedHeaderLineType lineType; + + + /** + * create a VCF filter header line + * + * @param name the name for this header line + * @param description the description for this header line + * @param lineType the header line type + */ + public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.lineType = lineType; + this.name = name; + this.description = description; + + if ( name == null || description == null ) + throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description )); + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + * @param lineType the header line type + */ + protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.lineType = lineType; + Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); + name = mapping.get("ID"); + description = mapping.get("Description"); + if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided + description = UNBOUND_DESCRIPTION; + } + + protected String toStringEncoding() { + Map map = new LinkedHashMap(); + map.put("ID", name); + map.put("Description", description); + return lineType.toString() + "=" + toStringEncoding(map); + } + + public boolean equals(Object o) { + if ( !(o instanceof VCFSimpleHeaderLine) ) + return false; + VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o; + return name.equals(other.name) && + description.equals(other.description); + } + + public String getName() { + return name; 
+ } + + public String getDescription() { + return description; + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFUtils.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFUtils.java new file mode 100755 index 000000000..dc78d40ac --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFUtils.java @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.variantcontext.v13; + +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; + +import java.util.*; + +/** + * A set of static utility methods for common operations on VCF files/records. + */ +class VCFUtils { + /** + * Constructor access disallowed...static utility methods only! + */ + private VCFUtils() { } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { + // Collect the eval rod names + final Set names = new TreeSet(); + for ( final RodBinding evalRod : rodBindings ) + names.add(evalRod.getName()); + return getVCFHeadersFromRods(toolkit, names); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) { + return getVCFHeadersFromRods(toolkit, (Collection)null); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { + Map data = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if it's not in our list + if ( rodNames != null && !rodNames.contains(source.getName()) ) + continue; + + if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) + data.put(source.getName(), (VCFHeader)source.getHeader()); + } + + return data; + } + + public static Map getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) { + Map data = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if lacks the prefix + if ( ! 
source.getName().startsWith(prefix) ) + continue; + + if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) + data.put(source.getName(), (VCFHeader)source.getHeader()); + } + + return data; + } + + /** + * Gets the header fields from all VCF rods input by the user + * + * @param toolkit GATK engine + * + * @return a set of all fields + */ + public static Set getHeaderFields(GenomeAnalysisEngine toolkit) { + return getHeaderFields(toolkit, null); + } + + /** + * Gets the header fields from all VCF rods input by the user + * + * @param toolkit GATK engine + * @param rodNames names of rods to use, or null if we should use all possible ones + * + * @return a set of all fields + */ + public static Set getHeaderFields(GenomeAnalysisEngine toolkit, Collection rodNames) { + + // keep a map of sample name to occurrences encountered + TreeSet fields = new TreeSet(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if it's not in our list + if ( rodNames != null && !rodNames.contains(source.getName()) ) + continue; + + if ( source.getRecordType().equals(VariantContext.class)) { + VCFHeader header = (VCFHeader)source.getHeader(); + if ( header != null ) + fields.addAll(header.getMetaData()); + } + } + + return fields; + } + + /** Only displays a warning if a logger is provided and an identical warning hasn't been already issued */ + private static final class HeaderConflictWarner { + Logger logger; + Set alreadyIssued = new HashSet(); + + private HeaderConflictWarner(final Logger logger) { + this.logger = logger; + } + + public void warn(final VCFHeaderLine line, final String msg) { + if ( logger != null && ! 
alreadyIssued.contains(line.getKey()) ) { + alreadyIssued.add(line.getKey()); + logger.warn(msg); + } + } + } + + public static Set smartMergeHeaders(Collection headers, Logger logger) throws IllegalStateException { + HashMap map = new HashMap(); // from KEY.NAME -> line + HeaderConflictWarner conflictWarner = new HeaderConflictWarner(logger); + + // todo -- needs to remove all version headers from sources and add its own VCF version line + for ( VCFHeader source : headers ) { + //System.out.printf("Merging in header %s%n", source); + for ( VCFHeaderLine line : source.getMetaData()) { + String key = line.getKey(); + + if ( line instanceof VCFNamedHeaderLine) + key = key + "" + ((VCFNamedHeaderLine) line).getName(); + + if ( map.containsKey(key) ) { + VCFHeaderLine other = map.get(key); + if ( line.equals(other) ) + continue; + else if ( ! line.getClass().equals(other.getClass()) ) + throw new IllegalStateException("Incompatible header types: " + line + " " + other ); + else if ( line instanceof VCFFilterHeaderLine) { + String lineName = ((VCFFilterHeaderLine) line).getName(); String otherName = ((VCFFilterHeaderLine) other).getName(); + if ( ! lineName.equals(otherName) ) + throw new IllegalStateException("Incompatible header types: " + line + " " + other ); + } else if ( line instanceof VCFCompoundHeaderLine ) { + VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line; + VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other; + + // if the names are the same, but the values are different, we need to quit + if (! (compLine).equalsExcludingDescription(compOther) ) { + if ( compLine.getType().equals(compOther.getType()) ) { + // The Number entry is an Integer that describes the number of values that can be + // included with the INFO field. For example, if the INFO field contains a single + // number, then this value should be 1. However, if the INFO field describes a pair + // of numbers, then this value should be 2 and so on. 
If the number of possible + // values varies, is unknown, or is unbounded, then this value should be '.'. + conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other); + compOther.setNumberToUnbounded(); + } else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) { + // promote key to Float + conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); + map.put(key, compOther); + } else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) { + // promote key to Float + conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); + } else { + throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); + } + } + if ( ! compLine.getDescription().equals(compOther) ) + conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); + } else { + // we are not equal, but we're not anything special either + conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other); + } + } else { + map.put(key, line); + //System.out.printf("Adding header line %s%n", line); + } + } + } + + return new HashSet(map.values()); + } + + public static String rsIDOfFirstRealVariant(List VCs, VariantContext.Type type) { + if ( VCs == null ) + return null; + + String rsID = null; + for ( VariantContext vc : VCs ) { + if ( vc.getType() == type ) { + rsID = vc.getID(); + break; + } + } + + return rsID; + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFWriter.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFWriter.java new file mode 100755 index 000000000..15bdb5046 --- /dev/null +++ 
b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VCFWriter.java @@ -0,0 +1,16 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +/** + * this class writes VCF files + */ +public interface VCFWriter { + + public void writeHeader(VCFHeader header); + + /** + * attempt to close the VCF file + */ + public void close(); + + public void add(VariantContext vc); +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContext.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContext.java new file mode 100755 index 000000000..3a193a00a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContext.java @@ -0,0 +1,1615 @@ +package org.broadinstitute.sting.utils.variantcontext.v13; + +import org.broad.tribble.Feature; +import org.broad.tribble.TribbleException; +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Class VariantContext + * + * == High-level overview == + * + * The VariantContext object is a single general class system for representing genetic variation data composed of: + * + * * Allele: representing single genetic haplotypes (A, T, ATC, -) + * * Genotype: an assignment of alleles for each chromosome of a single named sample at a particular locus + * * VariantContext: an abstract class holding all segregating alleles at a locus as well as genotypes + * for multiple individuals containing alleles at that locus + * + * The class system works by defining segregating alleles, creating a variant context representing the segregating + * information at a locus, and potentially creating and associating genotypes with individuals in the context. 
+ * + * All of the classes are highly validating -- call validate() if you modify them -- so you can rely on the + * self-consistency of the data once you have a VariantContext in hand. The system has a rich set of assessor + * and manipulator routines, as well as more complex static support routines in VariantContextUtils. + * + * The VariantContext (and Genotype) objects are attributed (supporting addition of arbitrary key/value pairs) and + * filtered (can represent a variation that is viewed as suspect). + * + * VariantContexts are dynamically typed, so whether a VariantContext is a SNP, Indel, or NoVariant depends + * on the properties of the alleles in the context. See the detailed documentation on the Type parameter below. + * + * It's also easy to create subcontexts based on selected genotypes. + * + * == Working with Variant Contexts == + * By default, VariantContexts are immutable. In order to access (in the rare circumstances where you need them) + * setter routines, you need to create MutableVariantContexts and MutableGenotypes. + * + * === Some example data === + * + * Allele A, Aref, T, Tref; + * Allele del, delRef, ATC, ATCref; + * + * A [ref] / T at 10 + * GenomeLoc snpLoc = GenomeLocParser.createGenomeLoc("chr1", 10, 10); + * + * - / ATC [ref] from 20-23 + * GenomeLoc delLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 22); + * + * // - [ref] / ATC immediately after 20 + * GenomeLoc insLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 20); + * + * === Alleles === + * + * See the documentation in the Allele class itself + * + * What are they? + * + * Alleles can be either reference or non-reference + * + * Example alleles used here: + * + * del = new Allele("-"); + * A = new Allele("A"); + * Aref = new Allele("A", true); + * T = new Allele("T"); + * ATC = new Allele("ATC"); + * + * === Creating variant contexts === + * + * ==== By hand ==== + * + * Here's an example of a A/T polymorphism with the A being reference: + * + *

+ * VariantContext vc = new VariantContext(name, snpLoc, Arrays.asList(Aref, T));
+ * 
+ * + * If you want to create a non-variant site, just put in a single reference allele + * + *
+ * VariantContext vc = new VariantContext(name, snpLoc, Arrays.asList(Aref));
+ * 
+ * + * A deletion is just as easy: + * + *
+ * VariantContext vc = new VariantContext(name, delLoc, Arrays.asList(ATCref, del));
+ * 
+ * + * The only 2 things that distinguishes between a insertion and deletion are the reference allele + * and the location of the variation. An insertion has a Null reference allele and at least + * one non-reference Non-Null allele. Additionally, the location of the insertion is immediately after + * a 1-bp GenomeLoc (at say 20). + * + *
+ * VariantContext vc = new VariantContext("name", insLoc, Arrays.asList(delRef, ATC));
+ * 
+ * + * ==== Converting rods and other data structures to VCs ==== + * + * You can convert many common types into VariantContexts using the general function: + * + *
+ * VariantContextAdaptors.convertToVariantContext(name, myObject)
+ * 
+ * + * dbSNP and VCFs, for example, can be passed in as myObject and a VariantContext corresponding to that + * object will be returned. A null return type indicates that the type isn't yet supported. This is the best + * and easiest way to create contexts using RODs. + * + * + * === Working with genotypes === + * + *
+ * List alleles = Arrays.asList(Aref, T);
+ * Genotype g1 = new Genotype(Arrays.asList(Aref, Aref), "g1", 10);
+ * Genotype g2 = new Genotype(Arrays.asList(Aref, T), "g2", 10);
+ * Genotype g3 = new Genotype(Arrays.asList(T, T), "g3", 10);
+ * VariantContext vc = new VariantContext(snpLoc, alleles, Arrays.asList(g1, g2, g3));
+ * 
+ * + * At this point we have 3 genotypes in our context, g1-g3. + * + * You can assess a good deal of information about the genotypes through the VariantContext: + * + *
+ * vc.hasGenotypes()
+ * vc.isMonomorphic()
+ * vc.isPolymorphic()
+ * vc.getSamples().size()
+ *
+ * vc.getGenotypes()
+ * vc.getGenotypes().get("g1")
+ * vc.hasGenotype("g1")
+ *
+ * vc.getChromosomeCount()
+ * vc.getChromosomeCount(Aref)
+ * vc.getChromosomeCount(T)
+ * 
+ * + * === NO_CALL alleles === + * + * The system allows one to create Genotypes carrying special NO_CALL alleles that aren't present in the + * set of context alleles and that represent undetermined alleles in a genotype: + * + * Genotype g4 = new Genotype(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), "NO_DATA_FOR_SAMPLE", 10); + * + * + * === subcontexts === + * It's also very easy get subcontext based only the data in a subset of the genotypes: + * + *
+ * VariantContext vc12 = vc.subContextFromGenotypes(Arrays.asList(g1,g2));
+ * VariantContext vc1 = vc.subContextFromGenotypes(Arrays.asList(g1));
+ * 
+ * + * @author depristo + */ +public class VariantContext implements Feature { // to enable tribble intergration + protected InferredGeneticContext commonInfo = null; + public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; + public final static String UNPARSED_GENOTYPE_MAP_KEY = "_UNPARSED_GENOTYPE_MAP_"; + public final static String UNPARSED_GENOTYPE_PARSER_KEY = "_UNPARSED_GENOTYPE_PARSER_"; + public final static String ID_KEY = "ID"; + + private final Byte REFERENCE_BASE_FOR_INDEL; + + public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); + + /** The location of this VariantContext */ + protected String contig; + protected long start; + protected long stop; + + /** The type (cached for performance reasons) of this context */ + protected Type type = null; + + /** A set of the alleles segregating in this context */ + final protected List alleles; + + /** A mapping from sampleName -> genotype objects for all genotypes associated with this context */ + protected Map genotypes = null; + + /** Counts for each of the possible Genotype types in this context */ + protected int[] genotypeCounts = null; + + public final static Map NO_GENOTYPES = Collections.unmodifiableMap(new HashMap()); + + // a fast cached access point to the ref / alt alleles for biallelic case + private Allele REF = null; + + // set to the alt allele when biallelic, otherwise == null + private Allele ALT = null; + + // were filters applied? + private boolean filtersWereAppliedToContext; + + // --------------------------------------------------------------------------------------------------------- + // + // constructors + // + // --------------------------------------------------------------------------------------------------------- + + + /** + * the complete constructor. Makes a complete VariantContext from its arguments + * This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES. 
+ * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param genotypes genotypes map + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and empty set for passes filters + * @param attributes attributes + * @param referenceBaseForIndel padded reference base + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false, true); + } + + /** + * the complete constructor. Makes a complete VariantContext from its arguments + * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param genotypes genotypes map + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and empty set for passes filters + * @param attributes attributes + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false, true); + } + + /** + * Makes a VariantContext from its arguments without parsing the genotypes. + * Note that this constructor assumes that if there is genotype data, then it's been put into + * the attributes with the UNPARSED_GENOTYPE_MAP_KEY and that the codec has been added with the + * UNPARSED_GENOTYPE_PARSER_KEY. It doesn't validate that this is the case because it's possible + * that there is no genotype data. 
+ * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and empty set for passes filters + * @param attributes attributes + * @param referenceBaseForIndel padded reference base + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { + this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true, true); + } + + /** + * Create a new VariantContext + * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param genotypes genotypes set + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and empty set for passes filters + * @param attributes attributes + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { + this(source, contig, start, stop, alleles, genotypes != null ? 
genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false, true); + } + + /** + * Create a new variant context without genotypes and no Perror, no filters, and no attributes + * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles) { + this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false, true); + } + + /** + * Create a new variant context with genotypes but without Perror, filters, and attributes + * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param genotypes genotypes + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { + this(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + } + + /** + * Copy constructor + * + * @param other the VariantContext to copy + */ + public VariantContext(VariantContext other) { + this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false, true); + } + + /** + * the actual constructor. 
Private access only + * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param genotypes genotypes map + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and empty set for passes filters + * @param attributes attributes + * @param referenceBaseForIndel padded reference base + * @param genotypesAreUnparsed true if the genotypes have not yet been parsed + * @param performValidation if true, call validate() as the final step in construction + */ + private VariantContext(String source, String contig, long start, long stop, + Collection alleles, Map genotypes, + double negLog10PError, Set filters, Map attributes, + Byte referenceBaseForIndel, boolean genotypesAreUnparsed, + boolean performValidation ) { + if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } + this.contig = contig; + this.start = start; + this.stop = stop; + + if ( !genotypesAreUnparsed && attributes != null ) { + if ( attributes.containsKey(UNPARSED_GENOTYPE_MAP_KEY) ) + attributes.remove(UNPARSED_GENOTYPE_MAP_KEY); + if ( attributes.containsKey(UNPARSED_GENOTYPE_PARSER_KEY) ) + attributes.remove(UNPARSED_GENOTYPE_PARSER_KEY); + } + + this.commonInfo = new InferredGeneticContext(source, negLog10PError, filters, attributes); + filtersWereAppliedToContext = filters != null; + REFERENCE_BASE_FOR_INDEL = referenceBaseForIndel; + + if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } + + // we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles + this.alleles = makeAlleles(alleles); + + + if ( genotypes == null ) { genotypes = NO_GENOTYPES; } + this.genotypes = Collections.unmodifiableMap(genotypes); + + // cache the REF and ALT alleles + int nAlleles = alleles.size(); + for ( Allele a : alleles ) { + if ( a.isReference() ) { + REF = a; + } 
else if ( nAlleles == 2 ) { // only cache ALT when biallelic + ALT = a; + } + } + + if ( performValidation ) { + validate(); + } + } + + // --------------------------------------------------------------------------------------------------------- + // + // Partial-cloning routines (because Variant Context is immutable). + // + // IMPORTANT: These routines assume that the VariantContext on which they're called is already valid. + // Due to this assumption, they explicitly tell the constructor NOT to perform validation by + // calling validate(), and instead perform validation only on the data that's changed. + // + // Note that we don't call vc.getGenotypes() because that triggers the lazy loading. + // Also note that we need to create a new attributes map because it's unmodifiable and the constructor may try to modify it. + // + // --------------------------------------------------------------------------------------------------------- + + public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) { + VariantContext modifiedVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false, false); + modifiedVC.validateGenotypes(); + return modifiedVC; + } + + public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) { + VariantContext modifiedVC = new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true, false); + + // Since start and end have changed, we need to call both validateAlleles() and validateReferencePadding(), + // since those validation routines rely on the values of start and end: + modifiedVC.validateAlleles(); + modifiedVC.validateReferencePadding(); + + return modifiedVC; + } + + public static VariantContext modifyFilters(VariantContext vc, Set filters) { + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true, false); + } + + public static VariantContext modifyAttributes(VariantContext vc, Map attributes) { + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true, false); + } + + public static VariantContext modifyReferencePadding(VariantContext vc, Byte b) { + VariantContext modifiedVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, vc.getAttributes(), b, true, false); + modifiedVC.validateReferencePadding(); + return modifiedVC; + } + + public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) { + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true, false); + } + + // --------------------------------------------------------------------------------------------------------- + // + // Selectors + // + // --------------------------------------------------------------------------------------------------------- + + /** + * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotype + * genotype and alleles in genotype. This is the right way to test if a single genotype is actually + * variant or not. + * + * @param genotype genotype + * @return vc subcontext + */ + public VariantContext subContextFromGenotypes(Genotype genotype) { + return subContextFromGenotypes(Arrays.asList(genotype)); + } + + + /** + * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotypes + * genotypes and alleles in these genotypes. This is the right way to test if a single genotype is actually + * variant or not. + * + * @param genotypes genotypes + * @return vc subcontext + */ + public VariantContext subContextFromGenotypes(Collection genotypes) { + return subContextFromGenotypes(genotypes, allelesOfGenotypes(genotypes)) ; + } + + /** + * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotypes + * genotypes. Also, the resulting variant context will contain the alleles provided, not only those found in genotypes + * + * @param genotypes genotypes + * @param alleles the set of allele segregating alleles at this site. 
Must include those in genotypes, but may be more + * @return vc subcontext + */ + public VariantContext subContextFromGenotypes(Collection genotypes, Collection alleles) { + return new VariantContext(getSource(), contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes(), getReferenceBaseForIndel()); + } + + + /** + * helper routine for subcontext + * @param genotypes genotypes + * @return allele set + */ + private Set allelesOfGenotypes(Collection genotypes) { + Set alleles = new HashSet(); + + boolean addedref = false; + for ( Genotype g : genotypes ) { + for ( Allele a : g.getAlleles() ) { + addedref = addedref || a.isReference(); + if ( a.isCalled() ) + alleles.add(a); + } + } + if ( ! addedref ) alleles.add(getReference()); + + return alleles; + } + + // --------------------------------------------------------------------------------------------------------- + // + // type operations + // + // --------------------------------------------------------------------------------------------------------- + + /** + * see: http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=handbook&part=ch5&rendertype=table&id=ch5.ch5_t3 + * + * Format: + * dbSNP variation class + * Rules for assigning allele classes + * Sample allele definition + * + * Single Nucleotide Polymorphisms (SNPs)a + * Strictly defined as single base substitutions involving A, T, C, or G. + * A/T + * + * Deletion/Insertion Polymorphisms (DIPs) + * Designated using the full sequence of the insertion as one allele, and either a fully + * defined string for the variant allele or a '-' character to specify the deleted allele. + * This class will be assigned to a variation if the variation alleles are of different lengths or + * if one of the alleles is deleted ('-'). 
+ * T/-/CCTA/G + * + * No-variation + * Reports may be submitted for segments of sequence that are assayed and determined to be invariant + * in the sample. + * (NoVariation) + * + * Mixed + * Mix of other classes + * + * Also supports NO_VARIATION type, used to indicate that the site isn't polymorphic in the population + * + * + * Not currently supported: + * + * Heterozygous sequencea + * The term heterozygous is used to specify a region detected by certain methods that do not + * resolve the polymorphism into a specific sequence motif. In these cases, a unique flanking + * sequence must be provided to define a sequence context for the variation. + * (heterozygous) + * + * Microsatellite or short tandem repeat (STR) + * Alleles are designated by providing the repeat motif and the copy number for each allele. + * Expansion of the allele repeat motif designated in dbSNP into full-length sequence will + * be only an approximation of the true genomic sequence because many microsatellite markers are + * not fully sequenced and are resolved as size variants only. + * (CAC)8/9/10/11 + * + * Named variant + * Applies to insertion/deletion polymorphisms of longer sequence features, such as retroposon + * dimorphism for Alu or line elements. These variations frequently include a deletion '-' indicator + * for the absent allele. + * (alu) / - + * + * Multi-Nucleotide Polymorphism (MNP) + * Assigned to variations that are multi-base variations of a single, common length + * GGA/AGT + */ + public enum Type { + NO_VARIATION, + SNP, + MNP, // a multi-nucleotide polymorphism + INDEL, + SYMBOLIC, + MIXED, + } + + /** + * Determines (if necessary) and returns the type of this variation by examining the alleles it contains. 
+ * + * @return the type of this VariantContext + **/ + public Type getType() { + if ( type == null ) + determineType(); + + return type; + } + + /** + * convenience method for SNPs + * + * @return true if this is a SNP, false otherwise + */ + public boolean isSNP() { return getType() == Type.SNP; } + + + /** + * convenience method for variants + * + * @return true if this is a variant allele, false if it's reference + */ + public boolean isVariant() { return getType() != Type.NO_VARIATION; } + + /** + * convenience method for point events + * + * @return true if this is a SNP or ref site, false if it's an indel or mixed event + */ + public boolean isPointEvent() { return isSNP() || !isVariant(); } + + /** + * convenience method for indels + * + * @return true if this is an indel, false otherwise + */ + public boolean isIndel() { return getType() == Type.INDEL; } + + /** + * @return true if the alleles indicate a simple insertion (i.e., the reference allele is Null) + */ + public boolean isSimpleInsertion() { + // can't just call !isSimpleDeletion() because of complex indels + return getType() == Type.INDEL && getReference().isNull() && isBiallelic(); + } + + /** + * @return true if the alleles indicate a simple deletion (i.e., a single alt allele that is Null) + */ + public boolean isSimpleDeletion() { + // can't just call !isSimpleInsertion() because of complex indels + return getType() == Type.INDEL && getAlternateAllele(0).isNull() && isBiallelic(); + } + + /** + * @return true if the alleles indicate neither a simple deletion nor a simple insertion + */ + public boolean isComplexIndel() { + return isIndel() && !isSimpleDeletion() && !isSimpleInsertion(); + } + + public boolean isSymbolic() { + return getType() == Type.SYMBOLIC; + } + + public boolean isMNP() { + return getType() == Type.MNP; + } + + /** + * convenience method for indels + * + * @return true if this is an mixed variation, false otherwise + */ + public boolean isMixed() { return getType() == 
Type.MIXED; } + + + // --------------------------------------------------------------------------------------------------------- + // + // Generic accessors + // + // --------------------------------------------------------------------------------------------------------- + + public boolean hasID() { + return commonInfo.hasAttribute(ID_KEY); + } + + public String getID() { + return (String)commonInfo.getAttribute(ID_KEY); + } + + public boolean hasReferenceBaseForIndel() { + return REFERENCE_BASE_FOR_INDEL != null; + } + + // the indel base that gets stripped off for indels + public Byte getReferenceBaseForIndel() { + return REFERENCE_BASE_FOR_INDEL; + } + + // --------------------------------------------------------------------------------------------------------- + // + // get routines to access context info fields + // + // --------------------------------------------------------------------------------------------------------- + public String getSource() { return commonInfo.getName(); } + public Set getFilters() { return commonInfo.getFilters(); } + public boolean isFiltered() { return commonInfo.isFiltered(); } + public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } + public boolean filtersWereApplied() { return filtersWereAppliedToContext; } + public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } + public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } + public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } + + public Map getAttributes() { return commonInfo.getAttributes(); } + public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); } + public Object getAttribute(String key) { return commonInfo.getAttribute(key); } + + public Object getAttribute(String key, Object defaultValue) { + return commonInfo.getAttribute(key, defaultValue); + } + + public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, 
defaultValue); } + public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } + public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } + public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } + + // --------------------------------------------------------------------------------------------------------- + // + // Working with alleles + // + // --------------------------------------------------------------------------------------------------------- + + /** + * @return the reference allele for this context + */ + public Allele getReference() { + Allele ref = REF; + if ( ref == null ) + throw new IllegalStateException("BUG: no reference allele found at " + this); + return ref; + } + + + /** + * @return true if the context is strictly bi-allelic + */ + public boolean isBiallelic() { + return getNAlleles() == 2; + } + + /** + * @return The number of segregating alleles in this context + */ + public int getNAlleles() { + return alleles.size(); + } + + /** + * @return The allele sharing the same bases as this String. A convenience method; better to use byte[] + */ + public Allele getAllele(String allele) { + return getAllele(allele.getBytes()); + } + + /** + * @return The allele sharing the same bases as this byte[], or null if no such allele is present. 
+ */ + public Allele getAllele(byte[] allele) { + return Allele.getMatchingAllele(getAlleles(), allele); + } + + /** + * @return True if this context contains Allele allele, or false otherwise + */ + public boolean hasAllele(Allele allele) { + return hasAllele(allele, false); + } + + public boolean hasAllele(Allele allele, boolean ignoreRefState) { + if ( allele == REF || allele == ALT ) // optimization for cached cases + return true; + + for ( Allele a : getAlleles() ) { + if ( a.equals(allele, ignoreRefState) ) + return true; + } + + return false; + } + + + /** + * Gets the alleles. This method should return all of the alleles present at the location, + * including the reference allele. There are no constraints imposed on the ordering of alleles + * in the set. If the reference is not an allele in this context it will not be included. + * + * @return the set of alleles + */ + public List getAlleles() { return alleles; } + + /** + * Gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference allele. There are no constraints imposed on the ordering of alleles + * in the set. 
+ * + * @return the set of alternate alleles + */ + public List getAlternateAlleles() { + return alleles.subList(1, alleles.size()); + } + + /** + * Gets the sizes of the alternate alleles if they are insertion/deletion events, and returns a list of their sizes + * + * @return a list of indel lengths ( null if not of type indel or mixed ) + */ + public List getIndelLengths() { + if ( getType() != Type.INDEL && getType() != Type.MIXED ) { + return null; + } + + List lengths = new ArrayList(); + for ( Allele a : getAlternateAlleles() ) { + lengths.add(a.length() - getReference().length()); + } + + return lengths; + } + + /** + * @param i -- the ith allele (from 0 to n - 2 for a context with n alleles including a reference allele) + * @return the ith non-reference allele in this context + * @throws IllegalArgumentException if i is invalid + */ + public Allele getAlternateAllele(int i) { + return alleles.get(i+1); + } + + /** + * @param other VariantContext whose alternate alleles to compare against + * @return true if this VariantContext has the same alternate alleles as other, + * regardless of ordering. Otherwise returns false. + */ + public boolean hasSameAlternateAllelesAs ( VariantContext other ) { + List thisAlternateAlleles = getAlternateAlleles(); + List otherAlternateAlleles = other.getAlternateAlleles(); + + if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) { + return false; + } + + for ( Allele allele : thisAlternateAlleles ) { + if ( ! 
otherAlternateAlleles.contains(allele) ) { + return false; + } + } + + return true; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Working with genotypes + // + // --------------------------------------------------------------------------------------------------------- + + private void loadGenotypes() { + if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) { + if ( genotypes == null ) + genotypes = NO_GENOTYPES; + return; + } + + Object parserObj = getAttribute(UNPARSED_GENOTYPE_PARSER_KEY); + if ( parserObj == null || !(parserObj instanceof VCFParser) ) + throw new IllegalStateException("There is no VCF parser stored to unparse the genotype data"); + VCFParser parser = (VCFParser)parserObj; + + Object mapObj = getAttribute(UNPARSED_GENOTYPE_MAP_KEY); + if ( mapObj == null ) + throw new IllegalStateException("There is no mapping string stored to unparse the genotype data"); + + genotypes = parser.createGenotypeMap(mapObj.toString(), new ArrayList(alleles), getChr(), getStart()); + + commonInfo.removeAttribute(UNPARSED_GENOTYPE_MAP_KEY); + commonInfo.removeAttribute(UNPARSED_GENOTYPE_PARSER_KEY); + + validateGenotypes(); + } + + /** + * @return the number of samples in the context + */ + public int getNSamples() { + loadGenotypes(); + return genotypes.size(); + } + + /** + * @return true if the context has associated genotypes + */ + public boolean hasGenotypes() { + loadGenotypes(); + return genotypes.size() > 0; + } + + public boolean hasGenotypes(Collection sampleNames) { + loadGenotypes(); + for ( String name : sampleNames ) { + if ( ! 
genotypes.containsKey(name) ) + return false; + } + return true; + } + + /** + * @return set of all Genotypes associated with this context + */ + public Map getGenotypes() { + loadGenotypes(); + return genotypes; + } + + public List getGenotypesSortedByName() { + loadGenotypes(); + Collection types = new TreeMap(genotypes).values(); + return new ArrayList(types); + } + + /** + * Returns a map from sampleName -> Genotype for the genotype associated with sampleName. Returns a map + * for consistency with the multi-get function. + * + * @param sampleName + * @return + * @throws IllegalArgumentException if sampleName isn't bound to a genotype + */ + public Map getGenotypes(String sampleName) { + return getGenotypes(Arrays.asList(sampleName)); + } + + /** + * Returns a map from sampleName -> Genotype for each sampleName in sampleNames. Returns a map + * for consistency with the multi-get function. + * + * @param sampleNames a unique list of sample names + * @return + * @throws IllegalArgumentException if sampleName isn't bound to a genotype + */ + public Map getGenotypes(Collection sampleNames) { + HashMap map = new HashMap(); + + for ( String name : sampleNames ) { + if ( map.containsKey(name) ) throw new IllegalArgumentException("Duplicate names detected in requested samples " + sampleNames); + final Genotype g = getGenotype(name); + if ( g != null ) { + map.put(name, g); + } + } + + return map; + } + + /** + * @return the set of all sample names in this context + */ + public Set getSampleNames() { + return getGenotypes().keySet(); + } + + /** + * @param sample the sample name + * + * @return the Genotype associated with the given sample in this context or null if the sample is not in this context + */ + public Genotype getGenotype(String sample) { + return getGenotypes().get(sample); + } + + public boolean hasGenotype(String sample) { + return getGenotypes().containsKey(sample); + } + + public Genotype getGenotype(int ith) { + return 
getGenotypesSortedByName().get(ith); + } + + + /** + * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS + * + * @return chromosome count + */ + public int getChromosomeCount() { + int n = 0; + + for ( Genotype g : getGenotypes().values() ) { + n += g.isNoCall() ? 0 : g.getPloidy(); + } + + return n; + } + + /** + * Returns the number of chromosomes carrying allele A in the genotypes + * + * @param a allele + * @return chromosome count + */ + public int getChromosomeCount(Allele a) { + int n = 0; + + for ( Genotype g : getGenotypes().values() ) { + n += g.getAlleles(a).size(); + } + + return n; + } + + /** + * Genotype-specific functions -- are the genotypes monomorphic w.r.t. to the alleles segregating at this + * site? That is, is the number of alternate alleles among all fo the genotype == 0? + * + * @return true if it's monomorphic + */ + public boolean isMonomorphic() { + return ! isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples()); + } + + /** + * Genotype-specific functions -- are the genotypes polymorphic w.r.t. to the alleles segregating at this + * site? That is, is the number of alternate alleles among all fo the genotype > 0? + * + * @return true if it's polymorphic + */ + public boolean isPolymorphic() { + return ! 
isMonomorphic(); + } + + private void calculateGenotypeCounts() { + if ( genotypeCounts == null ) { + genotypeCounts = new int[Genotype.Type.values().length]; + + for ( Genotype g : getGenotypes().values() ) { + if ( g.isNoCall() ) + genotypeCounts[Genotype.Type.NO_CALL.ordinal()]++; + else if ( g.isHomRef() ) + genotypeCounts[Genotype.Type.HOM_REF.ordinal()]++; + else if ( g.isHet() ) + genotypeCounts[Genotype.Type.HET.ordinal()]++; + else if ( g.isHomVar() ) + genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]++; + else + genotypeCounts[Genotype.Type.MIXED.ordinal()]++; + } + } + } + + /** + * Genotype-specific functions -- how many no-calls are there in the genotypes? + * + * @return number of no calls + */ + public int getNoCallCount() { + calculateGenotypeCounts(); + return genotypeCounts[Genotype.Type.NO_CALL.ordinal()]; + } + + /** + * Genotype-specific functions -- how many hom ref calls are there in the genotypes? + * + * @return number of hom ref calls + */ + public int getHomRefCount() { + calculateGenotypeCounts(); + return genotypeCounts[Genotype.Type.HOM_REF.ordinal()]; + } + + /** + * Genotype-specific functions -- how many het calls are there in the genotypes? + * + * @return number of het calls + */ + public int getHetCount() { + calculateGenotypeCounts(); + return genotypeCounts[Genotype.Type.HET.ordinal()]; + } + + /** + * Genotype-specific functions -- how many hom var calls are there in the genotypes? + * + * @return number of hom var calls + */ + public int getHomVarCount() { + return genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]; + } + + /** + * Genotype-specific functions -- how many mixed calls are there in the genotypes? 
+ * + * @return number of mixed calls + */ + public int getMixedCount() { + return genotypeCounts[Genotype.Type.MIXED.ordinal()]; + } + + // --------------------------------------------------------------------------------------------------------- + // + // validation: extra-strict validation routines for paranoid users + // + // --------------------------------------------------------------------------------------------------------- + + /** + * Run all extra-strict validation tests on a Variant Context object + * + * @param reference the true reference allele + * @param paddedRefBase the reference base used for padding indels + * @param rsIDs the true dbSNP IDs + */ + public void extraStrictValidation(Allele reference, Byte paddedRefBase, Set rsIDs) { + // validate the reference + validateReferenceBases(reference, paddedRefBase); + + // validate the RS IDs + validateRSIDs(rsIDs); + + // validate the altenate alleles + validateAlternateAlleles(); + + // validate the AN and AC fields + validateChromosomeCounts(); + + // TODO: implement me + //checkReferenceTrack(); + } + + public void validateReferenceBases(Allele reference, Byte paddedRefBase) { + // don't validate if we're a complex event + if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) { + throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); + } + + // we also need to validate the padding base for simple indels + if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) { + throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. 
VCF says %s", getChr(), getStart(), (char)paddedRefBase.byteValue(), (char)getReferenceBaseForIndel().byteValue())); + } + } + + public void validateRSIDs(Set rsIDs) { + if ( rsIDs != null && hasAttribute(VariantContext.ID_KEY) ) { + for ( String id : getID().split(VCFConstants.ID_FIELD_SEPARATOR) ) { + if ( id.startsWith("rs") && !rsIDs.contains(id) ) + throw new TribbleException.InternalCodecException(String.format("the rsID %s for the record at position %s:%d is not in dbSNP", id, getChr(), getStart())); + } + } + } + + public void validateAlternateAlleles() { + if ( !hasGenotypes() ) + return; + + List reportedAlleles = getAlleles(); + Set observedAlleles = new HashSet(); + observedAlleles.add(getReference()); + for ( Genotype g : getGenotypes().values() ) { + if ( g.isCalled() ) + observedAlleles.addAll(g.getAlleles()); + } + + if ( reportedAlleles.size() != observedAlleles.size() ) + throw new TribbleException.InternalCodecException(String.format("the ALT allele(s) for the record at position %s:%d do not match what is observed in the per-sample genotypes", getChr(), getStart())); + + int originalSize = reportedAlleles.size(); + // take the intersection and see if things change + observedAlleles.retainAll(reportedAlleles); + if ( observedAlleles.size() != originalSize ) + throw new TribbleException.InternalCodecException(String.format("the ALT allele(s) for the record at position %s:%d do not match what is observed in the per-sample genotypes", getChr(), getStart())); + } + + public void validateChromosomeCounts() { + Map observedAttrs = calculateChromosomeCounts(); + + // AN + if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { + int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString()); + int observedAN = (Integer)observedAttrs.get(VCFConstants.ALLELE_NUMBER_KEY); + if ( reportedAN != observedAN ) + throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at 
position %s:%d, %d vs. %d", getChr(), getStart(), reportedAN, observedAN)); + } + + // AC + if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { + List observedACs = (List)observedAttrs.get(VCFConstants.ALLELE_COUNT_KEY); + if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) { + Collections.sort(observedACs); + List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY); + Collections.sort(reportedACs); + if ( observedACs.size() != reportedACs.size() ) + throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have the correct number of values for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedACs.size(), observedACs.size())); + for (int i = 0; i < observedACs.size(); i++) { + if ( Integer.valueOf(reportedACs.get(i).toString()) != observedACs.get(i) ) + throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedACs.get(i), observedACs.get(i))); + } + } else { + if ( observedACs.size() != 1 ) + throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have enough values for the record at position %s:%d", getChr(), getStart())); + int reportedAC = Integer.valueOf(getAttribute(VCFConstants.ALLELE_COUNT_KEY).toString()); + if ( reportedAC != observedACs.get(0) ) + throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. 
%d", getChr(), getStart(), reportedAC, observedACs.get(0))); + } + } + } + + private Map calculateChromosomeCounts() { + Map attributes = new HashMap(); + + attributes.put(VCFConstants.ALLELE_NUMBER_KEY, getChromosomeCount()); + ArrayList alleleFreqs = new ArrayList(); + ArrayList alleleCounts = new ArrayList(); + + // if there are alternate alleles, record the relevant tags + if ( getAlternateAlleles().size() > 0 ) { + for ( Allele allele : getAlternateAlleles() ) { + alleleCounts.add(getChromosomeCount(allele)); + alleleFreqs.add((double)getChromosomeCount(allele) / (double)getChromosomeCount()); + } + } + // otherwise, set them to 0 + else { + alleleCounts.add(0); + alleleFreqs.add(0.0); + } + + attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts); + attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs); + return attributes; + } + + // --------------------------------------------------------------------------------------------------------- + // + // validation: the normal validation routines are called automatically upon creation of the VC + // + // --------------------------------------------------------------------------------------------------------- + + /** + * To be called by any modifying routines + */ + private boolean validate() { + return validate(true); + } + + private boolean validate(boolean throwException) { + try { + validateReferencePadding(); + validateAlleles(); + validateGenotypes(); + } catch ( IllegalArgumentException e ) { + if ( throwException ) + throw e; + else + return false; + } + + return true; + } + + private void validateReferencePadding() { + if (hasSymbolicAlleles()) // symbolic alleles don't need padding... 
+ return; + + boolean needsPadding = (getReference().length() == getEnd() - getStart()); // off by one because padded base was removed + + if ( needsPadding && !hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" + getStart() + "; no padded reference base was provided."); + } + + private void validateAlleles() { + // check alleles + boolean alreadySeenRef = false, alreadySeenNull = false; + for ( Allele allele : alleles ) { + // make sure there's only one reference allele + if ( allele.isReference() ) { + if ( alreadySeenRef ) throw new IllegalArgumentException("BUG: Received two reference tagged alleles in VariantContext " + alleles + " this=" + this); + alreadySeenRef = true; + } + + if ( allele.isNoCall() ) { + throw new IllegalArgumentException("BUG: Cannot add a no call allele to a variant context " + alleles + " this=" + this); + } + + // make sure there's only one null allele + if ( allele.isNull() ) { + if ( alreadySeenNull ) throw new IllegalArgumentException("BUG: Received two null alleles in VariantContext " + alleles + " this=" + this); + alreadySeenNull = true; + } + } + + // make sure there's one reference allele + if ( ! 
alreadySeenRef ) + throw new IllegalArgumentException("No reference allele found in VariantContext"); + +// if ( getType() == Type.INDEL ) { +// if ( getReference().length() != (getLocation().size()-1) ) { + long length = (stop - start) + 1; + if ( (getReference().isNull() && length != 1 ) || + (getReference().isNonNull() && (length - getReference().length() > 1))) { + throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this); + } + } + + private void validateGenotypes() { + if ( this.genotypes == null ) throw new IllegalStateException("Genotypes is null"); + + for ( Map.Entry elt : this.genotypes.entrySet() ) { + String name = elt.getKey(); + Genotype g = elt.getValue(); + + if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName()); + + if ( g.isAvailable() ) { + for ( Allele gAllele : g.getAlleles() ) { + if ( ! hasAllele(gAllele) && gAllele.isCalled() ) + throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); + } + } + } + } + + // --------------------------------------------------------------------------------------------------------- + // + // utility routines + // + // --------------------------------------------------------------------------------------------------------- + + private void determineType() { + if ( type == null ) { + switch ( getNAlleles() ) { + case 0: + throw new IllegalStateException("Unexpected error: requested type of VariantContext with no alleles!" + this); + case 1: + // note that this doesn't require a reference allele. 
You can be monomorphic independent of having a + // reference allele + type = Type.NO_VARIATION; + break; + default: + determinePolymorphicType(); + } + } + } + + private void determinePolymorphicType() { + type = null; + + // do a pairwise comparison of all alleles against the reference allele + for ( Allele allele : alleles ) { + if ( allele == REF ) + continue; + + // find the type of this allele relative to the reference + Type biallelicType = typeOfBiallelicVariant(REF, allele); + + // for the first alternate allele, set the type to be that one + if ( type == null ) { + type = biallelicType; + } + // if the type of this allele is different from that of a previous one, assign it the MIXED type and quit + else if ( biallelicType != type ) { + type = Type.MIXED; + return; + } + } + } + + private static Type typeOfBiallelicVariant(Allele ref, Allele allele) { + if ( ref.isSymbolic() ) + throw new IllegalStateException("Unexpected error: encountered a record with a symbolic reference allele"); + + if ( allele.isSymbolic() ) + return Type.SYMBOLIC; + + if ( ref.length() == allele.length() ) { + if ( allele.length() == 1 ) + return Type.SNP; + else + return Type.MNP; + } + + // Important note: previously we were checking that one allele is the prefix of the other. However, that's not an + // appropriate check as can be seen from the following example: + // REF = CTTA and ALT = C,CT,CA + // This should be assigned the INDEL type but was being marked as a MIXED type because of the prefix check. + // In truth, it should be absolutely impossible to return a MIXED type from this method because it simply + // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type + // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point + // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. 
+ return Type.INDEL; + + // old incorrect logic: + // if (oneIsPrefixOfOther(ref, allele)) + // return Type.INDEL; + // else + // return Type.MIXED; + } + + public String toString() { + return String.format("[VC %s @ %s of type=%s alleles=%s attr=%s GT=%s", + getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop), this.getType(), + ParsingUtils.sortList(this.getAlleles()), ParsingUtils.sortedString(this.getAttributes()), this.getGenotypesSortedByName()); + } + + // protected basic manipulation routines + private static List makeAlleles(Collection alleles) { + final List alleleList = new ArrayList(alleles.size()); + + boolean sawRef = false; + for ( final Allele a : alleles ) { + for ( final Allele b : alleleList ) { + if ( a.equals(b, true) ) + throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a); + } + + // deal with the case where the first allele isn't the reference + if ( a.isReference() ) { + if ( sawRef ) + throw new IllegalArgumentException("Alleles for a VariantContext must contain at most one reference allele: " + alleles); + alleleList.add(0, a); + sawRef = true; + } + else + alleleList.add(a); + } + + if ( alleleList.isEmpty() ) + throw new IllegalArgumentException("Cannot create a VariantContext with an empty allele list"); + + if ( alleleList.get(0).isNonReference() ) + throw new IllegalArgumentException("Alleles for a VariantContext must contain at least one reference allele: " + alleles); + + return alleleList; + } + + public static Map genotypeCollectionToMap(Map dest, Collection genotypes) { + for ( Genotype g : genotypes ) { + if ( dest.containsKey(g.getSampleName() ) ) + throw new IllegalArgumentException("Duplicate genotype added to VariantContext: " + g); + dest.put(g.getSampleName(), g); + } + + return dest; + } + + // --------------------------------------------------------------------------------------------------------- + // + // tribble integration routines -- not for public consumption 
+ // + // --------------------------------------------------------------------------------------------------------- + public String getChr() { + return contig; + } + + public int getStart() { + return (int)start; + } + + public int getEnd() { + return (int)stop; + } + + private boolean hasSymbolicAlleles() { + for (Allele a: getAlleles()) { + if (a.isSymbolic()) { + return true; + } + } + return false; + } + + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { + + // see if we need to pad common reference base from all alleles + boolean padVC; + + // We need to pad a VC with a common base if the length of the reference allele is less than the length of the VariantContext. + // This happens because the position of e.g. an indel is always one before the actual event (as per VCF convention). + long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; + if (inputVC.hasSymbolicAlleles()) + padVC = true; + else if (inputVC.getReference().length() == locLength) + padVC = false; + else if (inputVC.getReference().length() == locLength-1) + padVC = true; + else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + + " in contig " + inputVC.getChr() + ". 
Reference length must be at most one base shorter than location size"); + + // nothing to do if we don't need to pad bases + if (padVC) { + + if ( !inputVC.hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); + + Byte refByte = inputVC.getReferenceBaseForIndel(); + + List alleles = new ArrayList(); + Map genotypes = new TreeMap(); + + Map inputGenotypes = inputVC.getGenotypes(); + + for (Allele a : inputVC.getAlleles()) { + // get bases for current allele and create a new one with trimmed bases + if (a.isSymbolic()) { + alleles.add(a); + } else { + String newBases; + if ( refBaseShouldBeAppliedToEndOfAlleles ) + newBases = a.getBaseString() + new String(new byte[]{refByte}); + else + newBases = new String(new byte[]{refByte}) + a.getBaseString(); + alleles.add(Allele.create(newBases,a.isReference())); + } + } + + // now we can recreate new genotypes with trimmed alleles + for (String sample : inputVC.getSampleNames()) { + Genotype g = inputGenotypes.get(sample); + + List inAlleles = g.getAlleles(); + List newGenotypeAlleles = new ArrayList(); + for (Allele a : inAlleles) { + if (a.isCalled()) { + if (a.isSymbolic()) { + newGenotypeAlleles.add(a); + } else { + String newBases; + if ( refBaseShouldBeAppliedToEndOfAlleles ) + newBases = a.getBaseString() + new String(new byte[]{refByte}); + else + newBases = new String(new byte[]{refByte}) + a.getBaseString(); + newGenotypeAlleles.add(Allele.create(newBases,a.isReference())); + } + } + else { + // add no-call allele + newGenotypeAlleles.add(Allele.NO_CALL); + } + } + genotypes.put(sample, new Genotype(sample, newGenotypeAlleles, g.getNegLog10PError(), + g.getFilters(),g.getAttributes(),g.isPhased())); + + } + + // Do not change the filter state if filters were not applied to this context + Set inputVCFilters = inputVC.filtersWereAppliedToContext ? 
inputVC.getFilters() : null; + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes(),refByte); + } + else + return inputVC; + + } + + public ArrayList getTwoAllelesWithHighestAlleleCounts() { + // first idea: get two alleles with highest AC + int maxAC1 = 0, maxAC2=0,maxAC1ind =0, maxAC2ind = 0; + int i=0; + int[] alleleCounts = new int[this.getAlleles().size()]; + ArrayList alleleArray = new ArrayList(); + for (Allele a:this.getAlleles()) { + int ac = this.getChromosomeCount(a); + if (ac >=maxAC1) { + maxAC1 = ac; + maxAC1ind = i; + } + alleleArray.add(a); + alleleCounts[i++] = ac; + } + // now get second best allele + for (i=0; i < alleleCounts.length; i++) { + if (i == maxAC1ind) + continue; + if (alleleCounts[i] >= maxAC2) { + maxAC2 = alleleCounts[i]; + maxAC2ind = i; + } + } + + Allele alleleA, alleleB; + if (alleleArray.get(maxAC1ind).isReference()) { + alleleA = alleleArray.get(maxAC1ind); + alleleB = alleleArray.get(maxAC2ind); + } + else if (alleleArray.get(maxAC2ind).isReference()) { + alleleA = alleleArray.get(maxAC2ind); + alleleB = alleleArray.get(maxAC1ind); + } else { + alleleA = alleleArray.get(maxAC1ind); + alleleB = alleleArray.get(maxAC2ind); + } + ArrayList a = new ArrayList(); + a.add(alleleA); + a.add(alleleB); + return a; + } + public Allele getAltAlleleWithHighestAlleleCount() { + // first idea: get two alleles with highest AC + Allele best = null; + int maxAC1 = 0; + for (Allele a:this.getAlternateAlleles()) { + int ac = this.getChromosomeCount(a); + if (ac >=maxAC1) { + maxAC1 = ac; + best = a; + } + + } + return best; + } + + public int[] getGLIndecesOfAllele(Allele inputAllele) { + int[] idxVector = new int[3]; + int numAlleles = this.getAlleles().size(); + + int idxDiag = numAlleles; + int incr = numAlleles - 1; + int k=1; + for (Allele a: getAlternateAlleles()) { + // multi-allelic approximation, 
part 1: Ideally + // for each alt allele compute marginal (suboptimal) posteriors - + // compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of + // as a row-wise upper triangular matrix of likelihoods. + // So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC. + // 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD + + int idxAA = 0; + int idxAB = k++; + // yy is always element on the diagonal. + // 2 alleles: BBelement 2 + // 3 alleles: BB element 3. CC element 5 + // 4 alleles: + int idxBB = idxDiag; + + if (a.equals(inputAllele)) { + idxVector[0] = idxAA; + idxVector[1] = idxAB; + idxVector[2] = idxBB; + break; + } + idxDiag += incr--; + } + return idxVector; + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContextUtils.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContextUtils.java new file mode 100755 index 000000000..e2cf2ecf0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantContextUtils.java @@ -0,0 +1,1407 @@ +/* + * Copyright (c) 2010. The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext.v13; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.util.StringUtil; +import org.apache.commons.jexl2.Expression; +import org.apache.commons.jexl2.JexlEngine; +import org.apache.log4j.Logger; +import org.broad.tribble.util.popgen.HardyWeinbergCalculation; +import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.Serializable; +import java.util.*; + +public class VariantContextUtils { + private static Logger logger = Logger.getLogger(VariantContextUtils.class); + public final static String MERGE_INTERSECTION = "Intersection"; + public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; + public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; + public final static String MERGE_FILTER_PREFIX = "filterIn"; + + final public static JexlEngine engine = new JexlEngine(); + static { + engine.setSilent(false); // will throw errors now for selects that don't evaluate properly + engine.setLenient(false); + } + + /** + * Create a new VariantContext + * + * @param name name + * @param loc location + * @param alleles alleles + * @param genotypes genotypes set + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and 
empty set for passes filters + * @param attributes attributes + * @return VariantContext object + */ + public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { + return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes != null ? VariantContext.genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes); + } + + /** + * Create a new variant context without genotypes and no Perror, no filters, and no attributes + * @param name name + * @param loc location + * @param alleles alleles + * @return VariantContext object + */ + public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles) { + return new VariantContext (name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, VariantContext.NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + } + + /** + * Create a new variant context without genotypes and no Perror, no filters, and no attributes + * @param name name + * @param loc location + * @param alleles alleles + * @param genotypes genotypes + * @return VariantContext object + */ + public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles, Collection genotypes) { + return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + } + + /** + * Copy constructor + * + * @param other the VariantContext to copy + * @return VariantContext object + */ + public static VariantContext toVC(VariantContext other) { + return new VariantContext(other.getSource(), other.getChr(), other.getStart(), other.getEnd(), other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.getFilters(), other.getAttributes()); + } + + /** + * Update the attributes of the attributes map given the VariantContext to reflect the proper chromosome-based 
VCF tags + * + * @param vc the VariantContext + * @param attributes the attributes map to populate; must not be null; may contain old values + * @param removeStaleValues should we remove stale values from the mapping? + */ + public static void calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) { + // if everyone is a no-call, remove the old attributes if requested + if ( vc.getChromosomeCount() == 0 && removeStaleValues ) { + if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) + attributes.remove(VCFConstants.ALLELE_COUNT_KEY); + if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) ) + attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); + if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) ) + attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); + return; + } + + if ( vc.hasGenotypes() ) { + attributes.put(VCFConstants.ALLELE_NUMBER_KEY, vc.getChromosomeCount()); + + // if there are alternate alleles, record the relevant tags + if ( vc.getAlternateAlleles().size() > 0 ) { + ArrayList alleleFreqs = new ArrayList(); + ArrayList alleleCounts = new ArrayList(); + double totalChromosomes = (double)vc.getChromosomeCount(); + for ( Allele allele : vc.getAlternateAlleles() ) { + int altChromosomes = vc.getChromosomeCount(allele); + alleleCounts.add(altChromosomes); + String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double)altChromosomes / totalChromosomes)); + alleleFreqs.add(freq); + } + + attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); + attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? 
alleleFreqs.get(0) : alleleFreqs); + } + else { + attributes.put(VCFConstants.ALLELE_COUNT_KEY, 0); + attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0); + } + } + } + + private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { + int precision = 1; + + while ( maxValue > 1 ) { + precision++; + maxValue /= 10.0; + } + + return "%." + precision + "f"; + } + + public static Genotype removePLs(Genotype g) { + Map attrs = new HashMap(g.getAttributes()); + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); + return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased()); + } + + /** + * A simple but common wrapper for matching VariantContext objects using JEXL expressions + */ + public static class JexlVCMatchExp { + public String name; + public Expression exp; + + /** + * Create a new matcher expression with name and JEXL expression exp + * @param name name + * @param exp expression + */ + public JexlVCMatchExp(String name, Expression exp) { + this.name = name; + this.exp = exp; + } + } + + /** + * Method for creating JexlVCMatchExp from input walker arguments names and exps. These two arrays contain + * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return + * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to + * match() below. 
+ * + * @param names names + * @param exps expressions + * @return list of matches + */ + public static List initializeMatchExps(String[] names, String[] exps) { + if ( names == null || exps == null ) + throw new ReviewedStingException("BUG: neither names nor exps can be null: names " + Arrays.toString(names) + " exps=" + Arrays.toString(exps) ); + + if ( names.length != exps.length ) + throw new UserException("Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names) + " exps=" + Arrays.toString(exps)); + + Map map = new HashMap(); + for ( int i = 0; i < names.length; i++ ) { map.put(names[i], exps[i]); } + + return VariantContextUtils.initializeMatchExps(map); + } + + public static List initializeMatchExps(ArrayList names, ArrayList exps) { + String[] nameArray = new String[names.size()]; + String[] expArray = new String[exps.size()]; + return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray)); + } + + + /** + * Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps. These two arrays contain + * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return + * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to + * match() below. 
+ * + * @param names_and_exps mapping of names to expressions + * @return list of matches + */ + public static List initializeMatchExps(Map names_and_exps) { + List exps = new ArrayList(); + + for ( Map.Entry elt : names_and_exps.entrySet() ) { + String name = elt.getKey(); + String expStr = elt.getValue(); + + if ( name == null || expStr == null ) throw new IllegalArgumentException("Cannot create null expressions : " + name + " " + expStr); + try { + Expression exp = engine.createExpression(expStr); + exps.add(new JexlVCMatchExp(name, exp)); + } catch (Exception e) { + throw new UserException.BadArgumentValue(name, "Invalid expression used (" + expStr + "). Please see the JEXL docs for correct syntax.") ; + } + } + + return exps; + } + + /** + * Returns true if exp match VC. See collection<> version for full docs. + * @param vc variant context + * @param exp expression + * @return true if there is a match + */ + public static boolean match(VariantContext vc, JexlVCMatchExp exp) { + return match(vc,Arrays.asList(exp)).get(exp); + } + + /** + * Matches each JexlVCMatchExp exp against the data contained in vc, and returns a map from these + * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL + * expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp + * expressions. + * + * @param vc variant context + * @param exps expressions + * @return true if there is a match + */ + public static Map match(VariantContext vc, Collection exps) { + return new JEXLMap(exps,vc); + + } + + /** + * Returns true if exp match VC/g. See collection<> version for full docs. 
+ * @param vc variant context + * @param g genotype + * @param exp expression + * @return true if there is a match + */ + public static boolean match(VariantContext vc, Genotype g, JexlVCMatchExp exp) { + return match(vc,g,Arrays.asList(exp)).get(exp); + } + + /** + * Matches each JexlVCMatchExp exp against the data contained in vc/g, and returns a map from these + * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL + * expressions to VariantContext records/genotypes. Use initializeMatchExps() to create the list of JexlVCMatchExp + * expressions. + * + * @param vc variant context + * @param g genotype + * @param exps expressions + * @return true if there is a match + */ + public static Map match(VariantContext vc, Genotype g, Collection exps) { + return new JEXLMap(exps,vc,g); + } + + public static double computeHardyWeinbergPvalue(VariantContext vc) { + if ( vc.getChromosomeCount() == 0 ) + return 0.0; + return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); + } + + /** + * Returns a newly allocated VC that is the same as VC, but without genotypes + * @param vc variant context + * @return new VC without genotypes + */ + @Requires("vc != null") + @Ensures("result != null") + public static VariantContext sitesOnlyVariantContext(VariantContext vc) { + return VariantContext.modifyGenotypes(vc, null); + } + + /** + * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes + * @param vcs collection of VCs + * @return new VCs without genotypes + */ + @Requires("vcs != null") + @Ensures("result != null") + public static Collection sitesOnlyVariantContexts(Collection vcs) { + List r = new ArrayList(); + for ( VariantContext vc : vcs ) + r.add(sitesOnlyVariantContext(vc)); + return r; + } + + public static VariantContext pruneVariantContext(VariantContext vc) { + return pruneVariantContext(vc, null); + } + + public static 
VariantContext pruneVariantContext(final VariantContext vc, final Collection keysToPreserve ) { + final MutableVariantContext mvc = new MutableVariantContext(vc); + + if ( keysToPreserve == null || keysToPreserve.size() == 0 ) + mvc.clearAttributes(); + else { + final Map d = mvc.getAttributes(); + mvc.clearAttributes(); + for ( String key : keysToPreserve ) + if ( d.containsKey(key) ) + mvc.putAttribute(key, d.get(key)); + } + + // this must be done as the ID is stored in the attributes field + if ( vc.hasID() ) mvc.setID(vc.getID()); + + Collection gs = mvc.getGenotypes().values(); + mvc.clearGenotypes(); + for ( Genotype g : gs ) { + MutableGenotype mg = new MutableGenotype(g); + mg.clearAttributes(); + if ( keysToPreserve != null ) + for ( String key : keysToPreserve ) + if ( g.hasAttribute(key) ) + mg.putAttribute(key, g.getAttribute(key)); + mvc.addGenotype(mg); + } + + return mvc; + } + + public enum GenotypeMergeType { + /** + * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. + */ + UNIQUIFY, + /** + * Take genotypes in priority order (see the priority argument). + */ + PRIORITIZE, + /** + * Take the genotypes in any order. + */ + UNSORTED, + /** + * Require that all samples/genotypes be unique between all inputs. + */ + REQUIRE_UNIQUE + } + + public enum FilteredRecordMergeType { + /** + * Union - leaves the record if any record is unfiltered. + */ + KEEP_IF_ANY_UNFILTERED, + /** + * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. + */ + KEEP_IF_ALL_UNFILTERED + } + + /** + * Performs a master merge on the VCs. Here there is a master input [contains all of the information] and many + * VCs containing partial, extra genotype information which should be added to the master. For example, + * we scatter out the phasing algorithm over some samples in the master, producing a minimal VCF with phasing + * information per genotype. 
The master merge will add the PQ information from each genotype record, where + * appropriate, to the master VC. + * + * @param unsortedVCs collection of VCs + * @param masterName name of master VC + * @return master-merged VC + */ + public static VariantContext masterMerge(Collection unsortedVCs, String masterName) { + VariantContext master = findMaster(unsortedVCs, masterName); + Map genotypes = master.getGenotypes(); + for (Genotype g : genotypes.values()) { + genotypes.put(g.getSampleName(), new MutableGenotype(g)); + } + + Map masterAttributes = new HashMap(master.getAttributes()); + + for (VariantContext vc : unsortedVCs) { + if (!vc.getSource().equals(masterName)) { + for (Genotype g : vc.getGenotypes().values()) { + MutableGenotype masterG = (MutableGenotype) genotypes.get(g.getSampleName()); + for (Map.Entry attr : g.getAttributes().entrySet()) { + if (!masterG.hasAttribute(attr.getKey())) { + //System.out.printf("Adding GT attribute %s to masterG %s, new %s%n", attr, masterG, g); + masterG.putAttribute(attr.getKey(), attr.getValue()); + } + } + + if (masterG.isPhased() != g.isPhased()) { + if (masterG.sameGenotype(g)) { + // System.out.printf("Updating phasing %s to masterG %s, new %s%n", g.isPhased(), masterG, g); + masterG.setAlleles(g.getAlleles()); + masterG.setPhase(g.isPhased()); + } + //else System.out.println("WARNING: Not updating phase, since genotypes differ between master file and auxiliary info file!"); + } + +// if ( MathUtils.compareDoubles(masterG.getNegLog10PError(), g.getNegLog10PError()) != 0 ) { +// System.out.printf("Updating GQ %s to masterG %s, new %s%n", g.getNegLog10PError(), masterG, g); +// masterG.setNegLog10PError(g.getNegLog10PError()); +// } + + } + + for (Map.Entry attr : vc.getAttributes().entrySet()) { + if (!masterAttributes.containsKey(attr.getKey())) { + //System.out.printf("Adding VC attribute %s to master %s, new %s%n", attr, master, vc); + masterAttributes.put(attr.getKey(), attr.getValue()); + } + } + } + } + + 
return new VariantContext(master.getSource(), master.getChr(), master.getStart(), master.getEnd(), master.getAlleles(), genotypes, master.getNegLog10PError(), master.getFilters(), masterAttributes); + } + + private static VariantContext findMaster(Collection unsortedVCs, String masterName) { + for (VariantContext vc : unsortedVCs) { + if (vc.getSource().equals(masterName)) { + return vc; + } + } + + throw new ReviewedStingException(String.format("Couldn't find master VCF %s at %s", masterName, unsortedVCs.iterator().next())); + } + + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. + * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param genomeLocParser loc parser + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? 
+ * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, + final Collection unsortedVCs, + final List priorityListOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + if ( unsortedVCs == null || unsortedVCs.size() == 0 ) + return null; + + if ( annotateOrigin && priorityListOfVCs == null ) + throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); + + if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) + verifyUniqueSampleNames(unsortedVCs); + + List prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); + // Make sure all variant contexts are padded with reference base in case of indels if necessary + List VCs = new ArrayList(); + + for (VariantContext vc : prepaddedVCs) { + // also a reasonable place to remove filtered calls, if needed + if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) + VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc, false)); + } + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled + return null; + + // establish the baseline info from the first VC + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + final Allele refAllele = determineReferenceAllele(VCs); + + final Set alleles = new LinkedHashSet(); + final Set filters = new TreeSet(); + final Map attributes = new TreeMap(); + final Set inconsistentAttributes = new HashSet(); + final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id + + GenomeLoc loc = getLocation(genomeLocParser,first); + int depth = 0; + int maxAC = -1; + final Map attributesWithMaxAC = new TreeMap(); + double negLog10PError = -1; + VariantContext vcWithMaxAC = null; + Map genotypes = new TreeMap(); + + // counting the number of filtered and variant VCs + int nFiltered = 0; + + boolean remapped = false; + + // cycle through and add info from the other VCs, making sure the loc/reference matches + + for ( VariantContext vc : VCs ) { + if ( loc.getStart() != vc.getStart() ) // || !first.getReference().equals(vc.getReference()) ) + throw new ReviewedStingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); + + if ( getLocation(genomeLocParser,vc).size() > loc.size() ) + loc = getLocation(genomeLocParser,vc); // get the longest location + + nFiltered += vc.isFiltered() ? 
1 : 0; + if ( vc.isVariant() ) variantSources.add(vc.getSource()); + + AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); + remapped = remapped || alleleMapping.needsRemapping(); + + alleles.addAll(alleleMapping.values()); + + mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); + + negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); + + filters.addAll(vc.getFilters()); + + // + // add attributes + // + // special case DP (add it up) and ID (just preserve it) + // + if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + if ( vc.hasID() && ! vc.getID().equals(VCFConstants.EMPTY_ID_FIELD) ) rsIDs.add(vc.getID()); + if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); + // lets see if the string contains a , separator + if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { + List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); + for (String alleleCount : alleleCountArray) { + final int ac = Integer.valueOf(alleleCount.trim()); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } else { + final int ac = Integer.valueOf(rawAlleleCounts); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } + + for (Map.Entry p : vc.getAttributes().entrySet()) { + String key = p.getKey(); + // if we don't like the key already, don't go anywhere + if ( ! inconsistentAttributes.contains(key) ) { + boolean alreadyFound = attributes.containsKey(key); + Object boundValue = attributes.get(key); + boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + + if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! 
boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); + attributes.put(key, p.getValue()); + } + } + } + } + + // if we have more alternate alleles in the merged VC than in one or more of the + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF + for ( VariantContext vc : VCs ) { + if (vc.alleles.size() == 1) + continue; + if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { + logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", + genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); + genotypes = stripPLs(genotypes); + // this will remove stale AC,AF attributed from vc + calculateChromosomeCounts(vc, attributes, true); + break; + } + } + + // take the VC with the maxAC and pull the attributes into a modifiable map + if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); + } + + // if at least one record was unfiltered and we want a union, clear all of the filters + if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) + filters.clear(); + + + if ( annotateOrigin ) { // we care about where the call came from + String setValue; + if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered + setValue = MERGE_INTERSECTION; + else if ( nFiltered == VCs.size() ) // everything was filtered out + setValue = MERGE_FILTER_IN_ALL; + else if ( variantSources.isEmpty() ) // everyone was reference + setValue = MERGE_REF_IN_ALL; + 
else { + LinkedHashSet s = new LinkedHashSet(); + for ( VariantContext vc : VCs ) + if ( vc.isVariant() ) + s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); + setValue = Utils.join("-", s); + } + + if ( setKey != null ) { + attributes.put(setKey, setValue); + if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource()); } + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + if ( ! rsIDs.isEmpty() ) { + attributes.put(VariantContext.ID_KEY, Utils.join(",", rsIDs)); + } + + VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes) ); + // Trim the padded bases of all alleles if necessary + merged = createVariantContextWithTrimmedAlleles(merged); + + if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); + return merged; + } + + private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + final Iterator it1 = alleleSet1.iterator(); + final Iterator it2 = alleleSet2.iterator(); + + while ( it1.hasNext() && it2.hasNext() ) { + final Allele a1 = it1.next(); + final Allele a2 = it2.next(); + if ( ! a1.equals(a2) ) + return true; + } + + // by this point, at least one of the iterators is empty. All of the elements + // we've compared are equal up until this point. But it's possible that the + // sets aren't the same size, which is indicated by the test below. 
If they + // are of the same size, though, the sets are compatible + return it1.hasNext() || it2.hasNext(); + } + + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { + // if all alleles of vc1 are a contained in alleles of vc2, return true + if (!vc1.getReference().equals(vc2.getReference())) + return false; + + for (Allele a :vc1.getAlternateAlleles()) { + if (!vc2.getAlternateAlleles().contains(a)) + return false; + } + + return true; + } + public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { + // see if we need to trim common reference base from all alleles + boolean trimVC; + + // We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles + Allele refAllele = inputVC.getReference(); + if (!inputVC.isVariant()) + trimVC = false; + else if (refAllele.isNull()) + trimVC = false; + else { + trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), + inputVC.getReference().getDisplayString()) > 0); + } + + // nothing to do if we don't need to trim bases + if (trimVC) { + List alleles = new ArrayList(); + Map genotypes = new TreeMap(); + + // set the reference base for indels in the attributes + Map attributes = new TreeMap(inputVC.getAttributes()); + + Map originalToTrimmedAlleleMap = new HashMap(); + + for (Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length()); + Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // detect case where we're trimming bases but resulting vc doesn't have any null allele. 
In that case, we keep original representation + // example: mixed records such as {TA*,TGA,TG} + boolean hasNullAlleles = false; + + for (Allele a: originalToTrimmedAlleleMap.values()) { + if (a.isNull()) + hasNullAlleles = true; + if (a.isReference()) + refAllele = a; + } + + if (!hasNullAlleles) + return inputVC; + // now we can recreate new genotypes with trimmed alleles + for ( Map.Entry sample : inputVC.getGenotypes().entrySet() ) { + + List originalAlleles = sample.getValue().getAlleles(); + List trimmedAlleles = new ArrayList(); + for ( Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); + + } + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); + + } + + return inputVC; + } + + public static Map stripPLs(Map genotypes) { + Map newGs = new HashMap(genotypes.size()); + + for ( Map.Entry g : genotypes.entrySet() ) { + newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? removePLs(g.getValue()) : g.getValue()); + } + + return newGs; + } + + public static Map> separateVariantContextsByType(Collection VCs) { + HashMap> mappedVCs = new HashMap>(); + for ( VariantContext vc : VCs ) { + + // look at previous variant contexts of different type. If: + // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list + // b) vc has alleles which are subset of otherVC. 
Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) + // c) neither: do nothing, just add vc to its own list + boolean addtoOwnList = true; + for (VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vc.getType())) + continue; + + if (!mappedVCs.containsKey(type)) + continue; + + List vcList = mappedVCs.get(type); + for (int k=0; k < vcList.size(); k++) { + VariantContext otherVC = vcList.get(k); + if (allelesAreSubset(otherVC,vc)) { + // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list + vcList.remove(k); + // avoid having empty lists + if (vcList.size() == 0) + mappedVCs.remove(vcList); + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(otherVC); + break; + } + else if (allelesAreSubset(vc,otherVC)) { + // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own + mappedVCs.get(type).add(vc); + addtoOwnList = false; + break; + } + } + } + if (addtoOwnList) { + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(vc); + } + } + + return mappedVCs; + } + + private static class AlleleMapper { + private VariantContext vc = null; + private Map map = null; + public AlleleMapper(VariantContext vc) { this.vc = vc; } + public AlleleMapper(Map map) { this.map = map; } + public boolean needsRemapping() { return this.map != null; } + public Collection values() { return map != null ? map.values() : vc.getAlleles(); } + + public Allele remap(Allele a) { return map != null && map.containsKey(a) ? 
map.get(a) : a; } + + public List remap(List as) { + List newAs = new ArrayList(); + for ( Allele a : as ) { + //System.out.printf(" Remapping %s => %s%n", a, remap(a)); + newAs.add(remap(a)); + } + return newAs; + } + } + + static private void verifyUniqueSampleNames(Collection unsortedVCs) { + Set names = new HashSet(); + for ( VariantContext vc : unsortedVCs ) { + for ( String name : vc.getSampleNames() ) { + //System.out.printf("Checking %s %b%n", name, names.contains(name)); + if ( names.contains(name) ) + throw new UserException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name); + } + + names.addAll(vc.getSampleNames()); + } + } + + + static private Allele determineReferenceAllele(List VCs) { + Allele ref = null; + + for ( VariantContext vc : VCs ) { + Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) + throw new UserException.BadInput(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); + } + + return ref; + } + + static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { + if ( refAllele.equals(vc.getReference()) ) + return new AlleleMapper(vc); + else { + // we really need to do some work. The refAllele is the longest reference allele seen at this + // start site. 
So imagine it is: + // + // refAllele: ACGTGA + // myRef: ACGT + // myAlt: - + // + // We need to remap all of the alleles in vc to include the extra GA so that + // myRef => refAllele and myAlt => GA + // + + Allele myRef = vc.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new ReviewedStingException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + +// System.out.printf("Remapping allele at %s%n", vc); +// System.out.printf("ref %s%n", refAllele); +// System.out.printf("myref %s%n", myRef ); +// System.out.printf("extrabases %s%n", new String(extraBases)); + + Map map = new HashMap(); + for ( Allele a : vc.getAlleles() ) { + if ( a.isReference() ) + map.put(a, refAllele); + else { + Allele extended = Allele.extend(a, extraBases); + for ( Allele b : allAlleles ) + if ( extended.equals(b) ) + extended = b; +// System.out.printf(" Extending %s => %s%n", a, extended); + map.put(a, extended); + } + } + + // debugging +// System.out.printf("mapping %s%n", map); + + return new AlleleMapper(map); + } + } + + static class CompareByPriority implements Comparator, Serializable { + List priorityListOfVCs; + public CompareByPriority(List priorityListOfVCs) { + this.priorityListOfVCs = priorityListOfVCs; + } + + private int getIndex(VariantContext vc) { + int i = priorityListOfVCs.indexOf(vc.getSource()); + if ( i == -1 ) throw new UserException.BadArgumentValue(Utils.join(",", priorityListOfVCs), "Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); + return i; + } + + public int compare(VariantContext vc1, VariantContext vc2) { + return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); + } + } + + public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { + if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs 
== null ) + throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); + + if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) + return new ArrayList(unsortedVCs); + else { + ArrayList sorted = new ArrayList(unsortedVCs); + Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); + return sorted; + } + } + + private static void mergeGenotypes(Map mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { + for ( Genotype g : oneVC.getGenotypes().values() ) { + String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); + if ( ! mergedGenotypes.containsKey(name) ) { + // only add if the name is new + Genotype newG = g; + + if ( uniqifySamples || alleleMapping.needsRemapping() ) { + MutableGenotype mutG = new MutableGenotype(name, g); + if ( alleleMapping.needsRemapping() ) mutG.setAlleles(alleleMapping.remap(g.getAlleles())); + newG = mutG; + } + + mergedGenotypes.put(name, newG); + } + } + } + + public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { + return uniqify ? sampleName + "." + trackName : sampleName; + } + + /** + * Returns a context identical to this with the REF and ALT alleles reverse complemented. 
+ * + * @param vc variant context + * @return new vc + */ + public static VariantContext reverseComplement(VariantContext vc) { + // create a mapping from original allele to reverse complemented allele + HashMap alleleMap = new HashMap(vc.getAlleles().size()); + for ( Allele originalAllele : vc.getAlleles() ) { + Allele newAllele; + if ( originalAllele.isNoCall() || originalAllele.isNull() ) + newAllele = originalAllele; + else + newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); + alleleMap.put(originalAllele, newAllele); + } + + // create new Genotype objects + Map newGenotypes = new HashMap(vc.getNSamples()); + for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + List newAlleles = new ArrayList(); + for ( Allele allele : genotype.getValue().getAlleles() ) { + Allele newAllele = alleleMap.get(allele); + if ( newAllele == null ) + newAllele = Allele.NO_CALL; + newAlleles.add(newAllele); + } + newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); + } + + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, vc.getAttributes()); + + } + + public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { + if ( allowedAttributes == null ) + return vc; + + Map newGenotypes = new HashMap(vc.getNSamples()); + for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + Map attrs = new HashMap(); + for ( Map.Entry attr : genotype.getValue().getAttributes().entrySet() ) { + if ( allowedAttributes.contains(attr.getKey()) ) + attrs.put(attr.getKey(), attr.getValue()); + } + newGenotypes.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attrs)); + } + + return VariantContext.modifyGenotypes(vc, newGenotypes); + } + + public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { + if (!context.isSNP() || !context.isBiallelic()) + throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); + return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); + } + + /** + * If this is a BiAlleic SNP, is it a transition? + */ + public static boolean isTransition(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + /** + * If this is a BiAlleic SNP, is it a transversion? 
+ */ + public static boolean isTransversion(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + /** + * create a genome location, given a variant context + * @param genomeLocParser parser + * @param vc the variant context + * @return the genomeLoc + */ + public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { + return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); + } + + public abstract static class AlleleMergeRule { + // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): + abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); + + public String toString() { + return "all samples are mergeable"; + } + } + + // NOTE: returns null if vc1 and vc2 are not merged into a single MNP record + + public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { + if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) + return null; + + // Check that it's logically possible to merge the VCs: + if (!allSamplesAreMergeable(vc1, vc2)) + return null; + + // Check if there's a "point" in merging the VCs (e.g., annotations could be changed) + if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2)) + return null; + + return reallyMergeIntoMNP(vc1, vc2, referenceFile); + } + + private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { + int startInter = vc1.getEnd() + 1; + int endInter = vc2.getStart() - 1; + byte[] intermediateBases = null; + if (startInter <= endInter) { + intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); + StringUtil.toUpperCase(intermediateBases); + } + 
MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added + + Map mergedGenotypes = new HashMap(); + for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { + String sample = gt1Entry.getKey(); + Genotype gt1 = gt1Entry.getValue(); + Genotype gt2 = vc2.getGenotype(sample); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + List mergedAllelesForSample = new LinkedList(); + + /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records, + we preserve phase information (if any) relative to whatever precedes vc1: + */ + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() + + Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); + mergedAllelesForSample.add(mergedAllele); + } + + double mergedGQ = Math.max(gt1.getNegLog10PError(), gt2.getNegLog10PError()); + Set mergedGtFilters = new HashSet(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered + + Map mergedGtAttribs = new HashMap(); + PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); + if (phaseQual.PQ != null) + mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); + + Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); + mergedGenotypes.put(sample, mergedGt); + } + + String mergedName = VariantContextUtils.mergeVariantContextNames(vc1.getSource(), vc2.getSource()); + double mergedNegLog10PError = Math.max(vc1.getNegLog10PError(), vc2.getNegLog10PError()); + Set mergedFilters = new HashSet(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered + Map mergedAttribs = VariantContextUtils.mergeVariantContextAttributes(vc1, vc2); + + VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), 
vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs); + + mergedAttribs = new HashMap(mergedVc.getAttributes()); + VariantContextUtils.calculateChromosomeCounts(mergedVc, mergedAttribs, true); + mergedVc = VariantContext.modifyAttributes(mergedVc, mergedAttribs); + + return mergedVc; + } + + private static class AlleleOneAndTwo { + private Allele all1; + private Allele all2; + + public AlleleOneAndTwo(Allele all1, Allele all2) { + this.all1 = all1; + this.all2 = all2; + } + + public int hashCode() { + return all1.hashCode() + all2.hashCode(); + } + + public boolean equals(Object other) { + if (!(other instanceof AlleleOneAndTwo)) + return false; + + AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other; + return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); + } + } + + private static class MergedAllelesData { + private Map mergedAlleles; + private byte[] intermediateBases; + private int intermediateLength; + + public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { + this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo + this.intermediateBases = intermediateBases; + this.intermediateLength = this.intermediateBases != null ? 
this.intermediateBases.length : 0; + + this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); + } + + public Allele ensureMergedAllele(Allele all1, Allele all2) { + return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor + } + + private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { + AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); + Allele mergedAllele = mergedAlleles.get(all12); + + if (mergedAllele == null) { + byte[] bases1 = all1.getBases(); + byte[] bases2 = all2.getBases(); + + byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; + System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); + if (intermediateBases != null) + System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); + System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); + + mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); + mergedAlleles.put(all12, mergedAllele); + } + + return mergedAllele; + } + + public Set getAllMergedAlleles() { + return new HashSet(mergedAlleles.values()); + } + } + + private static String mergeVariantContextNames(String name1, String name2) { + return name1 + "_" + name2; + } + + private static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { + Map mergedAttribs = new HashMap(); + + List vcList = new LinkedList(); + vcList.add(vc1); + vcList.add(vc2); + + String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY}; + for (String orAttrib : MERGE_OR_ATTRIBS) { + boolean attribVal = false; + for (VariantContext vc : vcList) { + attribVal = vc.getAttributeAsBoolean(orAttrib, false); + if (attribVal) // already true, so no reason to continue: + break; + } + mergedAttribs.put(orAttrib, attribVal); + } + + // Merge ID fields: + String iDVal = null; + for (VariantContext vc : vcList) 
{ + String val = vc.getAttributeAsString(VariantContext.ID_KEY, null); + if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { + if (iDVal == null) + iDVal = val; + else + iDVal += VCFConstants.ID_FIELD_SEPARATOR + val; + } + } + if (iDVal != null) + mergedAttribs.put(VariantContext.ID_KEY, iDVal); + + return mergedAttribs; + } + + private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { + GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); + GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); + + if (!loc1.onSameContig(loc2)) + throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); + + if (!loc1.isBefore(loc2)) + throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2"); + + if (vc1.isFiltered() || vc2.isFiltered()) + return false; + + if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets + return false; + + if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) + return false; + + return true; + } + + private static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) { + for (Map.Entry gtEntry : vc.getGenotypes().entrySet()) { + Genotype gt = gtEntry.getValue(); + if (gt.isNoCall() || gt.isFiltered()) + return false; + } + + return true; + } + + // Assumes that vc1 and vc2 were already checked to have the same sample names: + + private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { + // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: + for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { + String sample = gt1Entry.getKey(); + Genotype gt1 = gt1Entry.getValue(); + Genotype gt2 = vc2.getGenotype(sample); + + if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom + return false; + } + + return true; + } + 
+ public static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { + if (gt1.getPloidy() != gt2.getPloidy()) + return false; + + /* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard]. + + HOWEVER, EVEN if this is not the case, but gt1.isHom(), + it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1. + */ + return (gt2.isPhased() || gt2.isHom() || gt1.isHom()); + } + + private static class PhaseAndQuality { + public boolean isPhased; + public Double PQ = null; + + public PhaseAndQuality(Genotype gt) { + this.isPhased = gt.isPhased(); + if (this.isPhased) { + this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + if ( this.PQ == -1 ) this.PQ = null; + } + } + } + + // Assumes that alleleSegregationIsKnown(gt1, gt2): + + private static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { + if (gt2.isPhased() || gt2.isHom()) + return new PhaseAndQuality(gt1); // maintain the phase of gt1 + + if (!gt1.isHom()) + throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()"); + + /* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype + + For example, if we're merging the third Genotype with the second one: + 0/1 + 1|1 + 0/1 + + Then, we want to output: + 0/1 + 1/2 + */ + return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()] + } + + /* Checks if any sample has a MNP of ALT alleles (segregating together): + [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] + */ + + public static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { + for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { + String sample = gt1Entry.getKey(); + Genotype gt1 
= gt1Entry.getValue(); + Genotype gt2 = vc2.getGenotype(sample); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() + + if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate + return true; + } + } + + return false; + } + + /* Checks if all samples are consistent in their haplotypes: + [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] + */ + + public static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { + // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): + Map allele1ToAllele2 = new HashMap(); + Map allele2ToAllele1 = new HashMap(); + + // Note the segregation of the alleles for the reference genome: + allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); + allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); + + // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). 
+ for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { + String sample = gt1Entry.getKey(); + Genotype gt1 = gt1Entry.getValue(); + Genotype gt2 = vc2.getGenotype(sample); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); + + Allele all1To2 = allele1ToAllele2.get(all1); + if (all1To2 == null) + allele1ToAllele2.put(all1, all2); + else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 + return false; + + Allele all2To1 = allele2ToAllele1.get(all2); + if (all2To1 == null) + allele2ToAllele1.put(all2, all1); + else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 + return false; + } + } + + return true; + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantJEXLContext.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantJEXLContext.java new file mode 100644 index 000000000..f16861f30 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/v13/VariantJEXLContext.java @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2010. The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext.v13; + +import org.apache.commons.jexl2.JexlContext; +import org.apache.commons.jexl2.MapContext; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * + * @author aaron + * @author depristo + * + * Class VariantJEXLContext + * + * implements the JEXML context for VariantContext; this saves us from + * having to generate a JEXML context lookup map everytime we want to evaluate an expression. + * + * This is package protected, only classes in variantcontext should have access to it. 
+ * + * // todo -- clean up to remove or better support genotype filtering + */ + +class VariantJEXLContext implements JexlContext { + // our stored variant context + private VariantContext vc; + + private interface AttributeGetter { + public Object get(VariantContext vc); + } + + private static Map x = new HashMap(); + + static { + x.put("vc", new AttributeGetter() { public Object get(VariantContext vc) { return vc; }}); + x.put("CHROM", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getChr(); }}); + x.put("POS", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getStart(); }}); + x.put("TYPE", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getType().toString(); }}); + x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return 10 * vc.getNegLog10PError(); }}); + x.put("ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getAlleles(); }}); + x.put("N_ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getNAlleles(); }}); + x.put("FILTER", new AttributeGetter() { public Object get(VariantContext vc) { return vc.isFiltered() ? 
"1" : "0"; }}); + +// x.put("GT", new AttributeGetter() { public Object get(VariantContext vc) { return g.getGenotypeString(); }}); + x.put("homRefCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomRefCount(); }}); + x.put("hetCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHetCount(); }}); + x.put("homVarCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomVarCount(); }}); + } + + public VariantJEXLContext(VariantContext vc) { + this.vc = vc; + } + + public Object get(String name) { + Object result = null; + if ( x.containsKey(name) ) { // dynamic resolution of name -> value via map + result = x.get(name).get(vc); + } else if ( vc.hasAttribute(name)) { + result = vc.getAttribute(name); + } else if ( vc.getFilters().contains(name) ) { + result = "1"; + } + + //System.out.printf("dynamic lookup %s => %s%n", name, result); + + return result; + } + + public boolean has(String name) { + return get(name) != null; + } + + public void set(String name, Object value) { + throw new UnsupportedOperationException("remove() not supported on a VariantJEXLContext"); + } +} + + + + +/** + * this is an implementation of a Map of JexlVCMatchExp to true or false values. It lazy initializes each value + * as requested to save as much processing time as possible. 
+ * + * Compatible with JEXL 1.1 (this code will be easier if we move to 2.0, all of the functionality can go into the + * JexlContext's get() + * + */ + +class JEXLMap implements Map { + // our variant context and/or Genotype + private final VariantContext vc; + private final Genotype g; + + // our context + private JexlContext jContext = null; + + // our mapping from JEXLVCMatchExp to Booleans, which will be set to NULL for previously uncached JexlVCMatchExp + private Map jexl; + + + public JEXLMap(Collection jexlCollection, VariantContext vc, Genotype g) { + this.vc = vc; + this.g = g; + initialize(jexlCollection); + } + + public JEXLMap(Collection jexlCollection, VariantContext vc) { + this(jexlCollection, vc, null); + } + + private void initialize(Collection jexlCollection) { + jexl = new HashMap(); + for (VariantContextUtils.JexlVCMatchExp exp: jexlCollection) { + jexl.put(exp, null); + } + } + + /** + * create the internal JexlContext, only when required. This code is where new JEXL context variables + * should get added. + * + */ + private void createContext() { + if ( g == null ) { + // todo -- remove dependancy on g to the entire system + jContext = new VariantJEXLContext(vc); + } else { + // + // this whole branch is here just to support G jexl operations + // + Map infoMap = new HashMap(); + + if ( vc != null ) { + // create a mapping of what we know about the variant context, its Chromosome, positions, etc. + infoMap.put("CHROM", vc.getChr()); + infoMap.put("POS", vc.getStart()); + infoMap.put("TYPE", vc.getType().toString()); + infoMap.put("QUAL", String.valueOf(vc.getPhredScaledQual())); + + // add alleles + infoMap.put("ALLELES", Utils.join(";", vc.getAlleles())); + infoMap.put("N_ALLELES", String.valueOf(vc.getNAlleles())); + + // add attributes + addAttributesToMap(infoMap, vc.getAttributes()); + + // add filter fields + infoMap.put("FILTER", vc.isFiltered() ? 
"1" : "0"); + for ( Object filterCode : vc.getFilters() ) { + infoMap.put(String.valueOf(filterCode), "1"); + } + + // add genotype-specific fields + // TODO -- implement me when we figure out a good way to represent this + // for ( Genotype g : vc.getGenotypes().values() ) { + // String prefix = g.getSampleName() + "."; + // addAttributesToMap(infoMap, g.getAttributes(), prefix); + // infoMap.put(prefix + "GT", g.getGenotypeString()); + // } + + // add specific genotype if one is provided + infoMap.put(VCFConstants.GENOTYPE_KEY, g.getGenotypeString()); + infoMap.put("isHomRef", g.isHomRef() ? "1" : "0"); + infoMap.put("isHet", g.isHet() ? "1" : "0"); + infoMap.put("isHomVar", g.isHomVar() ? "1" : "0"); + infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, new Double(g.getPhredScaledQual())); + for ( Map.Entry e : g.getAttributes().entrySet() ) { + if ( e.getValue() != null && !e.getValue().equals(VCFConstants.MISSING_VALUE_v4) ) + infoMap.put(e.getKey(), e.getValue()); + } + } + + // create the internal context that we can evaluate expressions against + + jContext = new MapContext(infoMap); + } + } + + /** + * @return the size of the internal data structure + */ + public int size() { + return jexl.size(); + } + + /** + * @return true if we're empty + */ + public boolean isEmpty() { return this.jexl.isEmpty(); } + + /** + * do we contain the specified key + * @param o the key + * @return true if we have a value for that key + */ + public boolean containsKey(Object o) { return jexl.containsKey(o); } + + public Boolean get(Object o) { + // if we've already determined the value, return it + if (jexl.containsKey(o) && jexl.get(o) != null) return jexl.get(o); + + // try and cast the expression + VariantContextUtils.JexlVCMatchExp e = (VariantContextUtils.JexlVCMatchExp) o; + evaluateExpression(e); + return jexl.get(e); + } + + /** + * get the keyset of map + * @return a set of keys of type JexlVCMatchExp + */ + public Set keySet() { + return jexl.keySet(); + } + + /** + * 
get all the values of the map. This is an expensive call, since it evaluates all keys that haven't + * been evaluated yet. This is fine if you truely want all the keys, but if you only want a portion, or know + * the keys you want, you would be better off using get() to get them by name. + * @return a collection of boolean values, representing the results of all the variants evaluated + */ + public Collection values() { + // this is an expensive call + for (VariantContextUtils.JexlVCMatchExp exp : jexl.keySet()) + if (jexl.get(exp) == null) + evaluateExpression(exp); + return jexl.values(); + } + + /** + * evaulate a JexlVCMatchExp's expression, given the current context (and setup the context if it's null) + * @param exp the JexlVCMatchExp to evaluate + */ + private void evaluateExpression(VariantContextUtils.JexlVCMatchExp exp) { + // if the context is null, we need to create it to evaluate the JEXL expression + if (this.jContext == null) createContext(); + try { + jexl.put (exp, (Boolean) exp.exp.evaluate(jContext)); + } catch (Exception e) { + throw new UserException.CommandLineException(String.format("Invalid JEXL expression detected for %s with message %s", exp.name, e.getMessage())); + } + } + + /** + * helper function: adds the list of attributes to the information map we're building + * @param infoMap the map + * @param attributes the attributes + */ + private static void addAttributesToMap(Map infoMap, Map attributes ) { + for (Map.Entry e : attributes.entrySet()) { + infoMap.put(e.getKey(), String.valueOf(e.getValue())); + } + } + + public Boolean put(VariantContextUtils.JexlVCMatchExp jexlVCMatchExp, Boolean aBoolean) { + return jexl.put(jexlVCMatchExp,aBoolean); + } + + public void putAll(Map map) { + jexl.putAll(map); + } + + // ////////////////////////////////////////////////////////////////////////////////////// + // The Following are unsupported at the moment + // 
////////////////////////////////////////////////////////////////////////////////////// + + // this doesn't make much sense to implement, boolean doesn't offer too much variety to deal + // with evaluating every key in the internal map. + public boolean containsValue(Object o) { + throw new UnsupportedOperationException("containsValue() not supported on a JEXLMap"); + } + + // this doesn't make much sense + public Boolean remove(Object o) { + throw new UnsupportedOperationException("remove() not supported on a JEXLMap"); + } + + + public Set> entrySet() { + throw new UnsupportedOperationException("clear() not supported on a JEXLMap"); + } + + // nope + public void clear() { + throw new UnsupportedOperationException("clear() not supported on a JEXLMap"); + } +}