diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java index 74d29db90..30a186151 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java @@ -16,7 +16,7 @@ public class Genotype { protected InferredGeneticContext commonInfo; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; - protected List alleles = new ArrayList(); + protected List alleles = null; // new ArrayList(); private boolean genotypesArePhased = false; public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean genotypesArePhased) { diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/InferredGeneticContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/InferredGeneticContext.java index cdb694b08..3b76c7731 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/InferredGeneticContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/InferredGeneticContext.java @@ -13,10 +13,13 @@ import java.util.*; final class InferredGeneticContext { public static final double NO_NEG_LOG_10PERROR = -1.0; + private static Set NO_FILTERS = Collections.unmodifiableSet(new HashSet()); + private static Map NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap()); + private double negLog10PError = NO_NEG_LOG_10PERROR; private String name = null; - private Set filters = new HashSet(); - private Map attributes = new HashMap(); + private Set filters = NO_FILTERS; + private Map attributes = NO_ATTRIBUTES; // public InferredGeneticContext(String name) { // this.name = name; @@ -73,6 +76,9 @@ final class InferredGeneticContext { } public void addFilter(String filter) { + if ( filters == NO_FILTERS ) // immutable -> mutable + filters = new HashSet(filters); + if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this); if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this); filters.add(filter); @@ -85,7 +91,10 @@ final class InferredGeneticContext { } public void clearFilters() { - filters.clear(); + if ( filters == NO_FILTERS ) + filters = new HashSet(); + else + filters.clear(); } public void setFilters(Collection filters) { @@ -123,7 +132,10 @@ final class InferredGeneticContext { // // --------------------------------------------------------------------------------------------------------- public void clearAttributes() { - this.attributes.clear(); + if ( attributes == NO_ATTRIBUTES ) + attributes = new HashMap(); + else + this.attributes.clear(); } /** @@ -136,7 +148,7 @@ final class InferredGeneticContext { // todo -- define common attributes as enum public void setAttributes(Map map) { - this.attributes.clear(); + clearAttributes(); putAttributes(map); } @@ -148,10 +160,15 @@ final class InferredGeneticContext { if ( hasAttribute(key) && ! allowOverwrites ) throw new StingException("Attempting to overwrite key->value binding: key = " + key + " this = " + this); + if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable + attributes = new HashMap(attributes); + this.attributes.put(key, value); } public void removeAttribute(String key) { + if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable + attributes = new HashMap(attributes); this.attributes.remove(key); } diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java index 2ccf3f5f5..3d4e95459 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; import org.broad.tribble.vcf.VCFRecord; +import org.broad.tribble.Feature; import java.util.*; @@ -160,7 +161,7 @@ import java.util.*; * * @author depristo */ -public class VariantContext { +public class VariantContext implements Feature { // to enable tribble intergration protected InferredGeneticContext commonInfo = null; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; @@ -997,4 +998,24 @@ public class VariantContext { return dest; } + // --------------------------------------------------------------------------------------------------------- + // + // tribble integration routines -- not for public consumption + // + // --------------------------------------------------------------------------------------------------------- + @Override + public String getChr() { + return getLocation().getContig(); + } + + @Override + public int getStart() { + return (int)getLocation().getStart(); + } + + @Override + public int getEnd() { + return (int)getLocation().getStop(); + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index 61f190e91..db59624fa 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -4,6 +4,7 @@ import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; import org.broad.tribble.exception.CodecLineParsingException; import org.broad.tribble.util.LineReader; +import org.broad.tribble.util.ParsingUtils; import org.broad.tribble.vcf.VCFHeader; import org.broad.tribble.vcf.VCFHeaderLine; import org.broad.tribble.vcf.VCFReaderUtils; @@ -12,6 +13,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLoc; import java.io.IOException; import java.util.*; @@ -30,60 +32,11 @@ public class VCF4Codec implements FeatureCodec { throw new StingException("DON'T USE THIS"); } - /** - * this a super hack like method to parse out what we need from a variant context - * @param line the line to parse - * @return - */ - @Override - public Feature decode(String line) { - - // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new IllegalStateException("VCF Header cannot be null"); - - // split the line on whitespace (Jim's parser will be faster, but it's broken right now) - String[] result = line.split("\\s+"); - - // check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes) - if (result.length != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + result.length+ " for line " + line); - - // our genotype names - Iterator iter = header.getGenotypeSamples().iterator(); - - // out genotype map, sample name to genotype - Map genotypes = new LinkedHashMap(); - - // our allele list, add the reference and the alts - List alleles = new ArrayList(); - String[] alts = result[4].split(","); - for (String alt : alts) - alleles.add(new Allele(alt,false)); - alleles.add(new Allele(result[3],true)); - - // parse out each of the genotypes - for (int genotypeIndex = 9; genotypeIndex < header.getColumnCount(); genotypeIndex++) { - if (!iter.hasNext()) throw new StingException("Wrong number of samples!"); - String sample = iter.next(); - genotypes.put(sample,createGenotypeFromString(sample,result[genotypeIndex],result[8].split(":"),alts,result[3])); - } - - // make a new set of all the filters - Set filters = new TreeSet(); - filters.addAll(Arrays.asList(result[5].split(","))); - - // create, validate, and return the record - VCF4Record rec = new VCF4Record(result[2], - GenomeLocParser.createGenomeLoc(result[0],Long.valueOf(result[1])), - Collections.unmodifiableCollection(alleles), - genotypes, - Double.valueOf(result[5]), - filters, - new HashMap()); - rec.validate(); - return rec; + public VCF4Codec(boolean itsOKImTesting) { + if ( ! itsOKImTesting ) + throw new StingException("DON'T USE THIS"); } - /** * this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates * @param reader the line reader to take header lines from @@ -118,39 +71,175 @@ public class VCF4Codec implements FeatureCodec { throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); } + private static int ZERO_CHAR = (byte)'0'; + private static Allele oneAllele(char index, List alleles) { + if ( index == '.' ) + return Allele.NO_CALL; + else { + int i = ((byte)index) - ZERO_CHAR; + return alleles.get(i); + } + } - /** - * create the genotype object from the VCF record - * @param name the name of the sample - * @param VCFEntry the entry; the text containing all the fields corresponding to the format fields - * @param formatStrings the format string - * @param altAlleleList our alt alleles - * @param reference the reference base(s) - * @return a Genotype object - */ - private Genotype createGenotypeFromString(String name, String VCFEntry, String[] formatStrings, String[] altAlleleList, String reference) { - // split the text entry into parts - String genotypeSplit[] = VCFEntry.split(":"); + private static Map> alleleMap = new HashMap>(3); - Set aList = new TreeSet(); - Map attributes = new LinkedHashMap(); + static long cacheHit = 0, gtParse = 0; - // for each entry in the vcf field (we drive by this so that dropped fields aren't processed - for (int index = 0; index < genotypeSplit.length; index++) { - if (formatStrings[index].toUpperCase().equals("GT")) { - String[] genotypes = genotypeSplit[index].split("[\\\\|\\/]+"); - for (String g : genotypes) { - int altIndex = Integer.valueOf(g); - if (altIndex == 0) - aList.add(new Allele(reference,true)); - else - aList.add(new Allele(altAlleleList[altIndex-1])); + private static List parseGenotypeAlleles(String GT, List alleles, Map> cache) { + // this should cache results [since they are immutable] and return a single object for each genotype + if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0 + List GTAlleles = cache.get(GT); + if ( GTAlleles == null ) { + GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles)); + cache.put(GT, GTAlleles); + } +// else { +// cacheHit++; +// } +// gtParse++; +// +// if ( cacheHit % 10000 == 0 ) +// System.out.printf("Cache hit %d %d %.2f%n", cacheHit, gtParse, (100.0*cacheHit) / gtParse); + + return GTAlleles; + } + + private Map parseInfo(String infoField, String id) { + Map attributes = new HashMap(); + + if ( ! infoField.equals(".") ) { // empty info field + for ( String field : infoField.split(";") ) { + int eqI = field.indexOf("="); + String key = null; + Object value = null; + if ( eqI != -1 ) { + key = field.substring(0, eqI); + value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc + //System.out.printf("%s %s%n", key, value); + } else { + key = field; + value = 1; } - } else { - attributes.put(formatStrings[index],genotypeSplit[index]); + + attributes.put(key, value); } } - return new Genotype(name,new ArrayList(aList),0.0,new HashSet(),attributes,false); + attributes.put("ID", id); + return attributes; + } + + //private static String[] CachedGTKey = new String[100]; + private static String[] CachedGTValues = new String[100]; + + public static boolean parseGenotypesToo = false; // for performance testing purposes + public static boolean validate = true; // for performance testing purposes + private static boolean REQUIRE_HEADER = false; + + // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over + private String[] parts = null; + + public Feature decode(String line) { + if ( parts == null ) + parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header + + int nParts = ParsingUtils.split(line, parts, '\t'); + + if (REQUIRE_HEADER) { // todo -- remove require header + // our header cannot be null, we need the genotype sample names and counts + if ( header == null) throw new IllegalStateException("VCF Header cannot be null"); + + // check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes) + if (nParts != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line); + } + + return parseVCFLine(parts, nParts); + } + + private static Double parseQual(String qualString) { + return qualString.equals("-1") ? VariantContext.NO_NEG_LOG_10PERROR : Double.valueOf(qualString) / 10; + } + + private VariantContext parseVCFLine(String[] parts, int nParts) { + // chr5 157273992 rs1211159 C T 0.00 0 . GT 0/0 0/0 0 + String contig = parts[0]; + long pos = Long.valueOf(parts[1]); + String id = parts[2]; + String ref = parts[3]; + String alts = parts[4]; + Double qual = parseQual(parts[5]); + String filter = parts[6]; + String info = parts[7]; + String GT = parts[8]; + int genotypesStart = 9; + + // add the reference allele + if ( ! Allele.acceptableAlleleBases(ref) ) { + System.out.printf("Excluding vcf record %s%n", ref); + return null; + } + + Set filters = ! filter.equals(".") ? new HashSet(Arrays.asList(filter.split(";"))) : null; + Map attributes = parseInfo(info, id); + + // add all of the alt alleles + + // todo -- use Allele factor method, not new, so we can keep a cache of the alleles since they are always the same + List alleles = new ArrayList(2); // we are almost always biallelic + Allele refAllele = new Allele(ref, true); + alleles.add(refAllele); + + for ( String alt : alts.split(",") ) { + if ( ! Allele.acceptableAlleleBases(alt) ) { + //System.out.printf("Excluding vcf record %s%n", vcf); + return null; + } + + Allele allele = new Allele(alt, false); + if ( ! allele.isNoCall() ) + alleles.add(allele); + } + + String[] GTKeys = GT.split(":"); // to performance issue + + Map genotypes = new HashMap(nParts); + if ( parseGenotypesToo ) { + alleleMap.clear(); + for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) { + String sample = parts[genotypeOffset]; + String[] GTValues = CachedGTValues; + ParsingUtils.split(sample, GTValues, ':'); // to performance issue + List genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap); + double GTQual = VariantContext.NO_NEG_LOG_10PERROR; + + // todo -- the parsing of attributes could be made lazy for performance + Map gtAttributes = null; + if ( GTKeys.length > 1 ) { + gtAttributes = new HashMap(GTKeys.length - 1); + for ( int i = 1; i < GTKeys.length; i++ ) { + if ( GTKeys[i].equals("GQ") ) { + GTQual = parseQual(GTValues[i]); + } else { + gtAttributes.put(GTKeys[i], GTValues[i]); + } + } + } + + Set genotypeFilters = null; + // genotypeFilters = new HashSet(); +// if ( vcfG.isFiltered() ) // setup the FL genotype filter fields +// genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";"))); + + boolean phased = GTKeys[0].charAt(1) == '|'; + Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased); + genotypes.put(g.getSampleName(), g); + } + } + + GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig,pos,pos+refAllele.length()-1); + + VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes); + if ( validate ) vc.validate(); + return vc; } /** @@ -159,6 +248,6 @@ public class VCF4Codec implements FeatureCodec { */ @Override public Class getFeatureType() { - return VCF4Record.class; + return VariantContext.class; } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Record.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Record.java deleted file mode 100644 index c45663716..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Record.java +++ /dev/null @@ -1,45 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata.features.vcf4; - -import org.broad.tribble.Feature; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.Collection; -import java.util.Map; -import java.util.Set; - -/** - * simple variant context wrapped as VCF4 - */ -public class VCF4Record extends VariantContext implements Feature { - /** - * create a VCF4Record, which is really a variant context - * @param name the name of the record - * @param loc it's location - * @param alleles the set of alleles - * @param genotypes any genotypes for this record - * @param negLog10PError the probability of being a wrong call - * @param filters the set of filters applied to this variant - * @param attributes any other attributes - */ - public VCF4Record(String name, GenomeLoc loc, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - super(name, loc, alleles, genotypes, negLog10PError, filters, attributes); - } - - @Override - public String getChr() { - return getLocation().getContig(); - } - - @Override - public int getStart() { - return (int)getLocation().getStart(); - } - - @Override - public int getEnd() { - return (int)getLocation().getStop(); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java new file mode 100755 index 000000000..23ca27ee5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broad.tribble.vcf.*; +import org.broad.tribble.util.ParsingUtils; +import org.broad.tribble.util.AsciiLineReader; +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; +import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.commandline.Argument; + +import java.util.*; +import java.io.*; + +import com.sun.xml.internal.ws.wsdl.parser.ParserUtil; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Apr 13, 2010 + */ +public class VCF4ReaderTestWalker extends RodWalker { + @Argument(shortName="MR", doc="", required=false) + int maxRecords = -1; + @Argument(shortName="vcf", doc="", required=true) + File vcfFile = null; + @Argument(shortName="Parse", doc="", required=true) + ParsingStatus splitFile = ParsingStatus.NONE; + @Argument(shortName="DontValidate", doc="", required=false) + boolean DontValidate = false; + + @Argument(shortName="USE_VCF3", doc="", required=false) + boolean USE_VCF3 = false; + + + public enum ParsingStatus { NONE, SPLIT_LINES, VARIANTS, GENOTYPES } + + public void initialize() { + } + + public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext context, AlignmentContext alicon) { + return null; + } + + public Long reduce(VCFRecord con, Long num) { + if ( con == null ) { + return num; + } + + return 1 + num; + } + + public Long reduceInit() { + return 0l; + } + + String[] parts = new String[10000]; + public void onTraversalDone(Long num){ + VCF4Codec vcf4codec = new VCF4Codec(true); + VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES; + VCF4Codec.validate = ! DontValidate; + + VCFCodec vcf3codec = new VCFCodec(); + + FeatureCodec codec = USE_VCF3 ? vcf3codec : vcf4codec; + + try { + AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(vcfFile)); + int lineNumber = codec.readHeader(lineReader); + out.printf("Read %d header lines%n", lineNumber); + + while (true) { + String line = lineReader.readLine(); + + if ( line == null ) + break; + + lineNumber++; + if ( lineNumber >= maxRecords && maxRecords != -1 ) { + return; + } + + if ( line.charAt(0) == '#' ) + continue; + + Object vc = null; + if ( splitFile == ParsingStatus.NONE ) { + + } + else if ( splitFile == ParsingStatus.SPLIT_LINES ) { + // todo -- look at header and determine number of elements that need to be parsed. Should be static per file + int nParts = ParsingUtils.split(line, parts, '\t'); + } else { + vc = codec.decode(line); + if ( USE_VCF3 ) { + VCFRecord rec = (VCFRecord)vc; + GenomeLoc loc = GenomeLocParser.createGenomeLoc(rec.getChr(), rec.getStart()); + ReferenceContext ref = new ReferenceContext(loc, (byte)rec.getReference().charAt(0)); + vc = VariantContextAdaptors.toVariantContext("X", vc, ref); + } + } + + if ( lineNumber % 10000 == 0 ) { + System.out.printf("%10d: %s%n", lineNumber, line.subSequence(0, 50)); + System.out.printf("%10d: %s%n", lineNumber, vc); + } + } + } catch ( FileNotFoundException e ) { + throw new StingException(e.getMessage()); + } catch ( IOException e ) { + throw new StingException(e.getMessage()); + } + } +} \ No newline at end of file