Optimized, nearly complete VCF4 reader that is 2-4x faster than the previous implementation, along with a VCF4 reader performance-testing walker that can read VCF 3/4 files, useful for benchmarking

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3487 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-06-04 18:11:38 +00:00
parent 6482b87741
commit b811e61ae1
6 changed files with 363 additions and 132 deletions

View File

@ -16,7 +16,7 @@ public class Genotype {
protected InferredGeneticContext commonInfo;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
protected List<Allele> alleles = new ArrayList<Allele>();
protected List<Allele> alleles = null; // new ArrayList<Allele>();
private boolean genotypesArePhased = false;
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {

View File

@ -13,10 +13,13 @@ import java.util.*;
final class InferredGeneticContext {
public static final double NO_NEG_LOG_10PERROR = -1.0;
private static Set<String> NO_FILTERS = Collections.unmodifiableSet(new HashSet<String>());
private static Map<String, Object> NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap<String, Object>());
private double negLog10PError = NO_NEG_LOG_10PERROR;
private String name = null;
private Set<String> filters = new HashSet<String>();
private Map<String, Object> attributes = new HashMap<String, Object>();
private Set<String> filters = NO_FILTERS;
private Map<String, Object> attributes = NO_ATTRIBUTES;
// public InferredGeneticContext(String name) {
// this.name = name;
@ -73,6 +76,9 @@ final class InferredGeneticContext {
}
public void addFilter(String filter) {
if ( filters == NO_FILTERS ) // immutable -> mutable
filters = new HashSet<String>(filters);
if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this);
if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this);
filters.add(filter);
@ -85,7 +91,10 @@ final class InferredGeneticContext {
}
/**
 * Remove all filters from this context.
 *
 * filters may still reference the shared immutable NO_FILTERS sentinel; in that
 * case swap in a fresh mutable set rather than calling clear() on the
 * unmodifiable collection (which would throw UnsupportedOperationException).
 */
public void clearFilters() {
    if ( filters == NO_FILTERS )
        filters = new HashSet<String>();
    else
        filters.clear();
}
public void setFilters(Collection<String> filters) {
@ -123,7 +132,10 @@ final class InferredGeneticContext {
//
// ---------------------------------------------------------------------------------------------------------
/**
 * Remove all attributes from this context.
 *
 * attributes may still reference the shared immutable NO_ATTRIBUTES sentinel;
 * in that case swap in a fresh mutable map rather than calling clear() on the
 * unmodifiable map (which would throw UnsupportedOperationException).
 */
public void clearAttributes() {
    if ( attributes == NO_ATTRIBUTES )
        attributes = new HashMap<String, Object>();
    else
        attributes.clear();
}
/**
@ -136,7 +148,7 @@ final class InferredGeneticContext {
// todo -- define common attributes as enum
/**
 * Replace all current attributes with the bindings in map.
 *
 * Delegates clearing to clearAttributes() so the shared immutable
 * NO_ATTRIBUTES sentinel is handled correctly before the new bindings
 * are copied in.
 *
 * @param map the new attribute bindings; values are copied via putAttributes
 */
public void setAttributes(Map<String, ?> map) {
    clearAttributes();
    putAttributes(map);
}
@ -148,10 +160,15 @@ final class InferredGeneticContext {
if ( hasAttribute(key) && ! allowOverwrites )
throw new StingException("Attempting to overwrite key->value binding: key = " + key + " this = " + this);
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
attributes = new HashMap<String, Object>(attributes);
this.attributes.put(key, value);
}
/**
 * Remove the binding for key, if present.
 *
 * When attributes is still the shared immutable NO_ATTRIBUTES sentinel the map
 * is empty, so there is nothing to remove; returning early avoids pointlessly
 * copying an empty map just to delete from it.
 *
 * @param key the attribute key to remove
 */
public void removeAttribute(String key) {
    if ( attributes == NO_ATTRIBUTES ) // immutable and empty -> nothing to do
        return;
    this.attributes.remove(key);
}

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broad.tribble.vcf.VCFRecord;
import org.broad.tribble.Feature;
import java.util.*;
@ -160,7 +161,7 @@ import java.util.*;
*
* @author depristo
*/
public class VariantContext {
public class VariantContext implements Feature { // to enable tribble intergration
protected InferredGeneticContext commonInfo = null;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
@ -997,4 +998,24 @@ public class VariantContext {
return dest;
}
// ---------------------------------------------------------------------------------------------------------
//
// tribble integration routines -- not for public consumption
//
// ---------------------------------------------------------------------------------------------------------
@Override
public String getChr() {
    // Tribble Feature accessor: contig name taken from this context's GenomeLoc
    return getLocation().getContig();
}
@Override
public int getStart() {
    // Tribble Feature accessor; GenomeLoc uses long coordinates, Feature uses int
    return (int)getLocation().getStart();
}
@Override
public int getEnd() {
    // Tribble Feature accessor; stop coordinate narrowed from long to int
    return (int)getLocation().getStop();
}
}

View File

@ -4,6 +4,7 @@ import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.util.LineReader;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFReaderUtils;
@ -12,6 +13,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.IOException;
import java.util.*;
@ -30,60 +32,11 @@ public class VCF4Codec implements FeatureCodec {
throw new StingException("DON'T USE THIS");
}
/**
* this a super hack like method to parse out what we need from a variant context
* @param line the line to parse
* @return
*/
@Override
public Feature decode(String line) {
// our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new IllegalStateException("VCF Header cannot be null");
// split the line on whitespace (Jim's parser will be faster, but it's broken right now)
String[] result = line.split("\\s+");
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
if (result.length != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + result.length+ " for line " + line);
// our genotype names
Iterator<String> iter = header.getGenotypeSamples().iterator();
// out genotype map, sample name to genotype
Map<String, Genotype> genotypes = new LinkedHashMap<String,Genotype>();
// our allele list, add the reference and the alts
List<Allele> alleles = new ArrayList<Allele>();
String[] alts = result[4].split(",");
for (String alt : alts)
alleles.add(new Allele(alt,false));
alleles.add(new Allele(result[3],true));
// parse out each of the genotypes
for (int genotypeIndex = 9; genotypeIndex < header.getColumnCount(); genotypeIndex++) {
if (!iter.hasNext()) throw new StingException("Wrong number of samples!");
String sample = iter.next();
genotypes.put(sample,createGenotypeFromString(sample,result[genotypeIndex],result[8].split(":"),alts,result[3]));
}
// make a new set of all the filters
Set<String> filters = new TreeSet<String>();
filters.addAll(Arrays.asList(result[5].split(",")));
// create, validate, and return the record
VCF4Record rec = new VCF4Record(result[2],
GenomeLocParser.createGenomeLoc(result[0],Long.valueOf(result[1])),
Collections.unmodifiableCollection(alleles),
genotypes,
Double.valueOf(result[5]),
filters,
new HashMap<String,Object>());
rec.validate();
return rec;
/**
 * Testing-only constructor: this codec is not yet ready for production use,
 * so construction is gated behind an explicit acknowledgement flag.
 *
 * @param itsOKImTesting must be true; passing false throws to prevent accidental use
 */
public VCF4Codec(boolean itsOKImTesting) {
    if ( ! itsOKImTesting )
        throw new StingException("DON'T USE THIS");
}
/**
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
* @param reader the line reader to take header lines from
@ -118,39 +71,175 @@ public class VCF4Codec implements FeatureCodec {
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
}
// offset for converting an ASCII digit character to its numeric allele index
private static final int ZERO_CHAR = '0';

/**
 * Map a single-character allele index from a GT field onto an Allele.
 *
 * @param index   a digit character ('0' = reference, '1'+ = an alt) or '.' for no-call
 * @param alleles the alleles at this site, reference first, in VCF order
 * @return the corresponding Allele, or Allele.NO_CALL for '.'
 */
private static Allele oneAllele(char index, List<Allele> alleles) {
    if ( index == '.' )
        return Allele.NO_CALL;
    // ASCII arithmetic: '0' -> 0, '1' -> 1, ...; an out-of-range index
    // surfaces as IndexOutOfBoundsException from alleles.get
    return alleles.get(index - ZERO_CHAR);
}
/**
* create the genotype object from the VCF record
* @param name the name of the sample
* @param VCFEntry the entry; the text containing all the fields corresponding to the format fields
* @param formatStrings the format string
* @param altAlleleList our alt alleles
* @param reference the reference base(s)
* @return a Genotype object
*/
private Genotype createGenotypeFromString(String name, String VCFEntry, String[] formatStrings, String[] altAlleleList, String reference) {
// split the text entry into parts
String genotypeSplit[] = VCFEntry.split(":");
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
Set<Allele> aList = new TreeSet<Allele>();
Map<String, Object> attributes = new LinkedHashMap<String,Object>();
static long cacheHit = 0, gtParse = 0;
// for each entry in the vcf field (we drive by this so that dropped fields aren't processed
for (int index = 0; index < genotypeSplit.length; index++) {
if (formatStrings[index].toUpperCase().equals("GT")) {
String[] genotypes = genotypeSplit[index].split("[\\\\|\\/]+");
for (String g : genotypes) {
int altIndex = Integer.valueOf(g);
if (altIndex == 0)
aList.add(new Allele(reference,true));
else
aList.add(new Allele(altAlleleList[altIndex-1]));
/**
 * Parse a diploid GT string ("0/1", "1|1", "./.") into a two-element allele list.
 *
 * Results are cached per GT string: the lists are immutable and identical for
 * every sample carrying the same genotype at this site, so each distinct
 * genotype string is only parsed once per record.
 *
 * @param GT      three-character genotype field: allele char, separator, allele char
 * @param alleles the alleles at this site, reference first
 * @param cache   per-record cache from GT string to its allele list
 * @return the (possibly cached) two-element allele list
 */
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
    if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
    List<Allele> GTAlleles = cache.get(GT);
    if ( GTAlleles == null ) {
        GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles));
        cache.put(GT, GTAlleles);
    }
    return GTAlleles;
}
private Map<String, Object> parseInfo(String infoField, String id) {
Map<String, Object> attributes = new HashMap<String, Object>();
if ( ! infoField.equals(".") ) { // empty info field
for ( String field : infoField.split(";") ) {
int eqI = field.indexOf("=");
String key = null;
Object value = null;
if ( eqI != -1 ) {
key = field.substring(0, eqI);
value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc
//System.out.printf("%s %s%n", key, value);
} else {
key = field;
value = 1;
}
} else {
attributes.put(formatStrings[index],genotypeSplit[index]);
attributes.put(key, value);
}
}
return new Genotype(name,new ArrayList(aList),0.0,new HashSet<String>(),attributes,false);
attributes.put("ID", id);
return attributes;
}
// reusable scratch buffer for split per-sample genotype values (avoids reallocating per sample)
private static String[] CachedGTValues = new String[100];

public static boolean parseGenotypesToo = false; // for performance testing purposes
public static boolean validate = true;           // for performance testing purposes
private static boolean REQUIRE_HEADER = false;

// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
private String[] parts = null;

/**
 * Decode a single VCF data line into a VariantContext Feature.
 *
 * The line is tab-split into the reusable parts buffer and handed to
 * parseVCFLine; the buffer is sized from the header when REQUIRE_HEADER is on,
 * otherwise generously over-allocated.
 *
 * @param line one tab-delimited VCF data line (not a header line)
 * @return the decoded VariantContext, or null if the record was rejected
 */
public Feature decode(String line) {
    // the header null check must happen BEFORE we size the parts array from it,
    // otherwise header.getColumnCount() below would NPE instead of reporting the problem
    if ( REQUIRE_HEADER && header == null ) // todo -- remove require header
        throw new IllegalStateException("VCF Header cannot be null");

    if ( parts == null )
        parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header

    int nParts = ParsingUtils.split(line, parts, '\t');

    // check to make sure the split resulted in the correct number of fields (8 + (1 + genotype count) if it has genotypes)
    if ( REQUIRE_HEADER && nParts != header.getColumnCount() ) // todo -- remove require header
        throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);

    return parseVCFLine(parts, nParts);
}
/**
 * Convert a phred-scaled quality string into a negLog10PError value.
 *
 * Both "-1" and "." (the VCF 4.0 missing-value marker) are treated as
 * "no quality available"; otherwise the phred value is divided by 10 to
 * obtain -log10(P(error)). Used for both the QUAL column and GQ fields.
 *
 * @param qualString the raw quality text
 * @return the negLog10PError, or VariantContext.NO_NEG_LOG_10PERROR when missing
 */
private static Double parseQual(String qualString) {
    if ( qualString.equals("-1") || qualString.equals(".") )
        return VariantContext.NO_NEG_LOG_10PERROR;
    return Double.valueOf(qualString) / 10;
}
/**
 * Build a VariantContext from a tab-split VCF 4 data line.
 *
 * Columns are CHROM POS ID REF ALT QUAL FILTER INFO [FORMAT sample1 sample2 ...].
 * Genotype columns are only parsed when parseGenotypesToo is set.
 *
 * @param parts  the tab-split fields of the line (buffer may be longer than nParts)
 * @param nParts the number of valid entries in parts for this line
 * @return the VariantContext, or null when REF or an ALT allele contains
 *         bases this reader cannot represent
 */
private VariantContext parseVCFLine(String[] parts, int nParts) {
    // chr5 157273992 rs1211159 C T 0.00 0 . GT 0/0 0/0 0
    String contig = parts[0];
    long pos = Long.valueOf(parts[1]);
    String id = parts[2];
    String ref = parts[3];
    String alts = parts[4];
    Double qual = parseQual(parts[5]);
    String filter = parts[6];
    String info = parts[7];
    int genotypesStart = 9;

    // reject records whose reference allele we can't represent
    if ( ! Allele.acceptableAlleleBases(ref) ) {
        System.out.printf("Excluding vcf record %s%n", ref);
        return null;
    }

    Set<String> filters = ! filter.equals(".") ? new HashSet<String>(Arrays.asList(filter.split(";"))) : null;
    Map<String, Object> attributes = parseInfo(info, id);

    // add all of the alt alleles
    // todo -- use Allele factor method, not new, so we can keep a cache of the alleles since they are always the same
    List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
    Allele refAllele = new Allele(ref, true);
    alleles.add(refAllele);
    for ( String alt : alts.split(",") ) {
        if ( ! Allele.acceptableAlleleBases(alt) ) {
            return null;
        }
        Allele allele = new Allele(alt, false);
        if ( ! allele.isNoCall() )
            alleles.add(allele);
    }

    Map<String, Genotype> genotypes = new HashMap<String, Genotype>(nParts);

    // Only touch the FORMAT column when the line actually has one: a sites-only
    // VCF has just 8 columns, and parts[8] would otherwise hold stale data from
    // a previous, longer line (the parts buffer is reused across calls).
    if ( parseGenotypesToo && nParts > 8 ) {
        String[] GTKeys = parts[8].split(":"); // to performance issue
        alleleMap.clear();
        for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) {
            String sample = parts[genotypeOffset];
            String[] GTValues = CachedGTValues;
            ParsingUtils.split(sample, GTValues, ':'); // to performance issue
            List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap);
            double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
            // todo -- the parsing of attributes could be made lazy for performance
            Map<String, String> gtAttributes = null;
            if ( GTKeys.length > 1 ) {
                gtAttributes = new HashMap<String, String>(GTKeys.length - 1);
                for ( int i = 1; i < GTKeys.length; i++ ) {
                    if ( GTKeys[i].equals("GQ") ) {
                        GTQual = parseQual(GTValues[i]);
                    } else {
                        gtAttributes.put(GTKeys[i], GTValues[i]);
                    }
                }
            }
            // todo -- parse per-genotype FL filter fields into genotypeFilters
            Set<String> genotypeFilters = null;
            boolean phased = GTKeys[0].charAt(1) == '|';
            // note: sample names are placeholders until header integration is complete
            Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
            genotypes.put(g.getSampleName(), g);
        }
    }

    GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig, pos, pos + refAllele.length() - 1);
    VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes);
    if ( validate ) vc.validate();
    return vc;
}
/**
@ -159,6 +248,6 @@ public class VCF4Codec implements FeatureCodec {
*/
@Override
public Class getFeatureType() {
    // VCF4Record has been removed; VariantContext now implements Feature directly
    return VariantContext.class;
}
}

View File

@ -1,45 +0,0 @@
package org.broadinstitute.sting.gatk.refdata.features.vcf4;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
/**
* simple variant context wrapped as VCF4
*/
public class VCF4Record extends VariantContext implements Feature {

    /**
     * Build a VCF4Record, which is simply a VariantContext exposed through
     * Tribble's Feature interface.
     *
     * @param name           the name of the record
     * @param loc            the genomic location of the record
     * @param alleles        the alleles segregating at this site
     * @param genotypes      per-sample genotypes for this record
     * @param negLog10PError -log10 probability that this call is wrong
     * @param filters        the filters applied to this variant
     * @param attributes     any additional key/value annotations
     */
    public VCF4Record(String name, GenomeLoc loc, Collection<Allele> alleles, Map<String, Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
        super(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
    }

    // Tribble Feature accessors: all delegate to this context's GenomeLoc.

    @Override
    public String getChr() {
        final GenomeLoc where = getLocation();
        return where.getContig();
    }

    @Override
    public int getStart() {
        final GenomeLoc where = getLocation();
        return (int) where.getStart();
    }

    @Override
    public int getEnd() {
        final GenomeLoc where = getLocation();
        return (int) where.getStop();
    }
}

View File

@ -0,0 +1,149 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broad.tribble.vcf.*;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.util.AsciiLineReader;
import org.broad.tribble.FeatureCodec;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.commandline.Argument;
import java.util.*;
import java.io.*;
import com.sun.xml.internal.ws.wsdl.parser.ParserUtil;
/**
* A VCF reader performance-testing walker: benchmarks plain line reading, line
* splitting, and full variant/genotype parsing for the VCF3 and VCF4 codecs.
*
* @Author chartl
* @Date Apr 13, 2010
*/
/**
 * Performance-testing walker for VCF readers. Streams a VCF file through the
 * VCF3 or VCF4 codec at a configurable level of parsing effort (nothing, line
 * splitting, variants, or variants + genotypes) so the stages can be
 * benchmarked independently. The map/reduce traversal is a no-op; all of the
 * real work happens in onTraversalDone().
 */
public class VCF4ReaderTestWalker extends RodWalker<VCFRecord,Long> {
    @Argument(shortName="MR", doc="", required=false)
    int maxRecords = -1; // -1 means read the whole file

    @Argument(shortName="vcf", doc="", required=true)
    File vcfFile = null;

    @Argument(shortName="Parse", doc="", required=true)
    ParsingStatus splitFile = ParsingStatus.NONE;

    @Argument(shortName="DontValidate", doc="", required=false)
    boolean DontValidate = false;

    @Argument(shortName="USE_VCF3", doc="", required=false)
    boolean USE_VCF3 = false;

    // how much parsing work to do per line, from nothing to full genotype parsing
    public enum ParsingStatus { NONE, SPLIT_LINES, VARIANTS, GENOTYPES }

    public void initialize() {
    }

    /** No-op: this walker does all its work in onTraversalDone(). */
    public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext context, AlignmentContext alicon) {
        return null;
    }

    public Long reduce(VCFRecord con, Long num) {
        if ( con == null ) {
            return num;
        }
        return 1 + num;
    }

    public Long reduceInit() {
        return 0l;
    }

    // scratch buffer for tab-splitting lines without reallocating per line
    String[] parts = new String[10000];

    /**
     * Reads the whole VCF file after the traversal finishes, applying the
     * requested amount of parsing per line and printing progress every
     * 10000 lines.
     */
    public void onTraversalDone(Long num) {
        VCF4Codec vcf4codec = new VCF4Codec(true);
        VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
        VCF4Codec.validate = ! DontValidate;
        VCFCodec vcf3codec = new VCFCodec();
        FeatureCodec codec = USE_VCF3 ? vcf3codec : vcf4codec;

        try {
            AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(vcfFile));
            try {
                int lineNumber = codec.readHeader(lineReader);
                out.printf("Read %d header lines%n", lineNumber);

                while (true) {
                    String line = lineReader.readLine();
                    if ( line == null )
                        break;
                    lineNumber++;

                    // check the record limit before doing any work on this line
                    if ( maxRecords != -1 && lineNumber >= maxRecords ) {
                        return;
                    }

                    // skip blank lines (charAt(0) would throw) and stray header lines
                    if ( line.length() == 0 || line.charAt(0) == '#' )
                        continue;

                    Object vc = null;
                    if ( splitFile == ParsingStatus.NONE ) {
                        // just measure raw line reading: no parsing at all
                    }
                    else if ( splitFile == ParsingStatus.SPLIT_LINES ) {
                        // todo -- look at header and determine number of elements that need to be parsed. Should be static per file
                        int nParts = ParsingUtils.split(line, parts, '\t');
                    } else {
                        vc = codec.decode(line);
                        if ( USE_VCF3 ) {
                            // the VCF3 codec emits VCFRecords; adapt to VariantContext for comparability
                            VCFRecord rec = (VCFRecord)vc;
                            GenomeLoc loc = GenomeLocParser.createGenomeLoc(rec.getChr(), rec.getStart());
                            ReferenceContext ref = new ReferenceContext(loc, (byte)rec.getReference().charAt(0));
                            vc = VariantContextAdaptors.toVariantContext("X", vc, ref);
                        }
                    }

                    if ( lineNumber % 10000 == 0 ) {
                        // bound the preview length: lines can be shorter than 50 characters
                        System.out.printf("%10d: %s%n", lineNumber, line.subSequence(0, Math.min(50, line.length())));
                        System.out.printf("%10d: %s%n", lineNumber, vc);
                    }
                }
            } finally {
                // don't leak the file handle, even on the early maxRecords return
                lineReader.close();
            }
        } catch ( FileNotFoundException e ) {
            throw new StingException(e.getMessage());
        } catch ( IOException e ) {
            throw new StingException(e.getMessage());
        }
    }
}