From 6482b87741e33aa3ec34d3f9791ee9b68a1e8403 Mon Sep 17 00:00:00 2001 From: aaron Date: Fri, 4 Jun 2010 07:38:46 +0000 Subject: [PATCH] adding the super experimental, half-broken, generally crippled, awkwardly commented, header ignoring vcf4 code. Don't use this, unless you're a developer for VCF4. If so, remove the exception from the constructor so that it won't always exception out. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3486 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/features/vcf4/VCF4Codec.java | 164 ++++++++++++++++++ .../refdata/features/vcf4/VCF4Record.java | 45 +++++ 2 files changed, 209 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java create mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Record.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java new file mode 100644 index 000000000..61f190e91 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -0,0 +1,164 @@ +package org.broadinstitute.sting.gatk.refdata.features.vcf4; + +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.util.LineReader; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFReaderUtils; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; + +import java.io.IOException; +import java.util.*; + +/** + * a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as + * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + */ +public class VCF4Codec implements FeatureCodec { + + // we have to store the list of strings that make up the header until they're needed + private List headerStrings = new ArrayList(); + private VCFHeader header = null; + + public VCF4Codec() { + throw new StingException("DON'T USE THIS"); + } + + /** + * this a super hack like method to parse out what we need from a variant context + * @param line the line to parse + * @return + */ + @Override + public Feature decode(String line) { + + // our header cannot be null, we need the genotype sample names and counts + if (header == null) throw new IllegalStateException("VCF Header cannot be null"); + + // split the line on whitespace (Jim's parser will be faster, but it's broken right now) + String[] result = line.split("\\s+"); + + // check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes) + if (result.length != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + result.length+ " for line " + line); + + // our genotype names + Iterator iter = header.getGenotypeSamples().iterator(); + + // out genotype map, sample name to genotype + Map genotypes = new LinkedHashMap(); + + // our allele list, add the reference and the alts + List alleles = new ArrayList(); + String[] alts = result[4].split(","); + for (String alt : alts) + alleles.add(new Allele(alt,false)); + alleles.add(new Allele(result[3],true)); + + // parse out each of the genotypes + for (int genotypeIndex = 9; genotypeIndex < header.getColumnCount(); genotypeIndex++) { + if (!iter.hasNext()) throw new StingException("Wrong number of samples!"); + String sample = iter.next(); + genotypes.put(sample,createGenotypeFromString(sample,result[genotypeIndex],result[8].split(":"),alts,result[3])); + } + + // make a new set of all the filters + Set filters = new TreeSet(); + filters.addAll(Arrays.asList(result[5].split(","))); + + // create, validate, and return the record + VCF4Record rec = new VCF4Record(result[2], + GenomeLocParser.createGenomeLoc(result[0],Long.valueOf(result[1])), + Collections.unmodifiableCollection(alleles), + genotypes, + Double.valueOf(result[5]), + filters, + new HashMap()); + rec.validate(); + return rec; + } + + + /** + * this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates + * @param reader the line reader to take header lines from + * @return the number of header lines + */ + @Override + public int readHeader(LineReader reader) { + String line = ""; + try { + while ((line = reader.readLine()) != null) { + if (line.startsWith("##")) { + headerStrings.add(line); + } + else if (line.startsWith("#")) { + headerStrings.add(line); + String[] genotypes = line.split("\\s+"); + Set genotypeSampleNames = new TreeSet(); + for (int x = 8; x < genotypes.length; x++) + genotypeSampleNames.add(genotypes[x]); + // this should be the next line -> header = VCFReaderUtils.createHeader(headerStrings); + header = new VCFHeader(new HashSet(),genotypeSampleNames); + return headerStrings.size(); + } + else { + throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); + } + + } + } catch (IOException e) { + throw new RuntimeException("IO Exception ", e); + } + throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); + } + + + /** + * create the genotype object from the VCF record + * @param name the name of the sample + * @param VCFEntry the entry; the text containing all the fields corresponding to the format fields + * @param formatStrings the format string + * @param altAlleleList our alt alleles + * @param reference the reference base(s) + * @return a Genotype object + */ + private Genotype createGenotypeFromString(String name, String VCFEntry, String[] formatStrings, String[] altAlleleList, String reference) { + // split the text entry into parts + String genotypeSplit[] = VCFEntry.split(":"); + + Set aList = new TreeSet(); + Map attributes = new LinkedHashMap(); + + // for each entry in the vcf field (we drive by this so that dropped fields aren't processed + for (int index = 0; index < genotypeSplit.length; index++) { + if (formatStrings[index].toUpperCase().equals("GT")) { + String[] genotypes = genotypeSplit[index].split("[\\\\|\\/]+"); + for (String g : genotypes) { + int altIndex = Integer.valueOf(g); + if (altIndex == 0) + aList.add(new Allele(reference,true)); + else + aList.add(new Allele(altAlleleList[altIndex-1])); + } + } else { + attributes.put(formatStrings[index],genotypeSplit[index]); + } + } + return new Genotype(name,new ArrayList(aList),0.0,new HashSet(),attributes,false); + } + + /** + * + * @return the type of record + */ + @Override + public Class getFeatureType() { + return VCF4Record.class; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Record.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Record.java new file mode 100644 index 000000000..c45663716 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Record.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.gatk.refdata.features.vcf4; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.Collection; +import java.util.Map; +import java.util.Set; + +/** + * simple variant context wrapped as VCF4 + */ +public class VCF4Record extends VariantContext implements Feature { + /** + * create a VCF4Record, which is really a variant context + * @param name the name of the record + * @param loc it's location + * @param alleles the set of alleles + * @param genotypes any genotypes for this record + * @param negLog10PError the probability of being a wrong call + * @param filters the set of filters applied to this variant + * @param attributes any other attributes + */ + public VCF4Record(String name, GenomeLoc loc, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { + super(name, loc, alleles, genotypes, negLog10PError, filters, attributes); + } + + @Override + public String getChr() { + return getLocation().getContig(); + } + + @Override + public int getStart() { + return (int)getLocation().getStart(); + } + + @Override + public int getEnd() { + return (int)getLocation().getStop(); + } +}