Optimized, nearly complete VCF4 reader 2-4x faster than the previous implementation, along with a VCF4 reader performance testing walker that can read 3/4 files, useful for benchmarking
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3487 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6482b87741
commit
b811e61ae1
|
|
@ -16,7 +16,7 @@ public class Genotype {
|
|||
|
||||
protected InferredGeneticContext commonInfo;
|
||||
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
|
||||
protected List<Allele> alleles = new ArrayList<Allele>();
|
||||
protected List<Allele> alleles = null; // new ArrayList<Allele>();
|
||||
private boolean genotypesArePhased = false;
|
||||
|
||||
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
|
||||
|
|
|
|||
|
|
@ -13,10 +13,13 @@ import java.util.*;
|
|||
final class InferredGeneticContext {
|
||||
public static final double NO_NEG_LOG_10PERROR = -1.0;
|
||||
|
||||
private static Set<String> NO_FILTERS = Collections.unmodifiableSet(new HashSet<String>());
|
||||
private static Map<String, Object> NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap<String, Object>());
|
||||
|
||||
private double negLog10PError = NO_NEG_LOG_10PERROR;
|
||||
private String name = null;
|
||||
private Set<String> filters = new HashSet<String>();
|
||||
private Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
private Set<String> filters = NO_FILTERS;
|
||||
private Map<String, Object> attributes = NO_ATTRIBUTES;
|
||||
|
||||
// public InferredGeneticContext(String name) {
|
||||
// this.name = name;
|
||||
|
|
@ -73,6 +76,9 @@ final class InferredGeneticContext {
|
|||
}
|
||||
|
||||
public void addFilter(String filter) {
|
||||
if ( filters == NO_FILTERS ) // immutable -> mutable
|
||||
filters = new HashSet<String>(filters);
|
||||
|
||||
if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this);
|
||||
if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this);
|
||||
filters.add(filter);
|
||||
|
|
@ -85,7 +91,10 @@ final class InferredGeneticContext {
|
|||
}
|
||||
|
||||
public void clearFilters() {
|
||||
filters.clear();
|
||||
if ( filters == NO_FILTERS )
|
||||
filters = new HashSet<String>();
|
||||
else
|
||||
filters.clear();
|
||||
}
|
||||
|
||||
public void setFilters(Collection<String> filters) {
|
||||
|
|
@ -123,7 +132,10 @@ final class InferredGeneticContext {
|
|||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
public void clearAttributes() {
|
||||
this.attributes.clear();
|
||||
if ( attributes == NO_ATTRIBUTES )
|
||||
attributes = new HashMap<String, Object>();
|
||||
else
|
||||
this.attributes.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -136,7 +148,7 @@ final class InferredGeneticContext {
|
|||
// todo -- define common attributes as enum
|
||||
|
||||
public void setAttributes(Map<String, ?> map) {
|
||||
this.attributes.clear();
|
||||
clearAttributes();
|
||||
putAttributes(map);
|
||||
}
|
||||
|
||||
|
|
@ -148,10 +160,15 @@ final class InferredGeneticContext {
|
|||
if ( hasAttribute(key) && ! allowOverwrites )
|
||||
throw new StingException("Attempting to overwrite key->value binding: key = " + key + " this = " + this);
|
||||
|
||||
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
|
||||
attributes = new HashMap<String, Object>(attributes);
|
||||
|
||||
this.attributes.put(key, value);
|
||||
}
|
||||
|
||||
public void removeAttribute(String key) {
|
||||
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
|
||||
attributes = new HashMap<String, Object>(attributes);
|
||||
this.attributes.remove(key);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.StingException;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
import org.broad.tribble.Feature;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -160,7 +161,7 @@ import java.util.*;
|
|||
*
|
||||
* @author depristo
|
||||
*/
|
||||
public class VariantContext {
|
||||
public class VariantContext implements Feature { // to enable tribble intergration
|
||||
protected InferredGeneticContext commonInfo = null;
|
||||
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
|
||||
|
||||
|
|
@ -997,4 +998,24 @@ public class VariantContext {
|
|||
return dest;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// tribble integration routines -- not for public consumption
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
@Override
|
||||
public String getChr() {
|
||||
return getLocation().getContig();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getStart() {
|
||||
return (int)getLocation().getStart();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getEnd() {
|
||||
return (int)getLocation().getStop();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -4,6 +4,7 @@ import org.broad.tribble.Feature;
|
|||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.exception.CodecLineParsingException;
|
||||
import org.broad.tribble.util.LineReader;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFReaderUtils;
|
||||
|
|
@ -12,6 +13,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
|||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
|
@ -30,60 +32,11 @@ public class VCF4Codec implements FeatureCodec {
|
|||
throw new StingException("DON'T USE THIS");
|
||||
}
|
||||
|
||||
/**
|
||||
* this a super hack like method to parse out what we need from a variant context
|
||||
* @param line the line to parse
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public Feature decode(String line) {
|
||||
|
||||
// our header cannot be null, we need the genotype sample names and counts
|
||||
if (header == null) throw new IllegalStateException("VCF Header cannot be null");
|
||||
|
||||
// split the line on whitespace (Jim's parser will be faster, but it's broken right now)
|
||||
String[] result = line.split("\\s+");
|
||||
|
||||
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
|
||||
if (result.length != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + result.length+ " for line " + line);
|
||||
|
||||
// our genotype names
|
||||
Iterator<String> iter = header.getGenotypeSamples().iterator();
|
||||
|
||||
// out genotype map, sample name to genotype
|
||||
Map<String, Genotype> genotypes = new LinkedHashMap<String,Genotype>();
|
||||
|
||||
// our allele list, add the reference and the alts
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
String[] alts = result[4].split(",");
|
||||
for (String alt : alts)
|
||||
alleles.add(new Allele(alt,false));
|
||||
alleles.add(new Allele(result[3],true));
|
||||
|
||||
// parse out each of the genotypes
|
||||
for (int genotypeIndex = 9; genotypeIndex < header.getColumnCount(); genotypeIndex++) {
|
||||
if (!iter.hasNext()) throw new StingException("Wrong number of samples!");
|
||||
String sample = iter.next();
|
||||
genotypes.put(sample,createGenotypeFromString(sample,result[genotypeIndex],result[8].split(":"),alts,result[3]));
|
||||
}
|
||||
|
||||
// make a new set of all the filters
|
||||
Set<String> filters = new TreeSet<String>();
|
||||
filters.addAll(Arrays.asList(result[5].split(",")));
|
||||
|
||||
// create, validate, and return the record
|
||||
VCF4Record rec = new VCF4Record(result[2],
|
||||
GenomeLocParser.createGenomeLoc(result[0],Long.valueOf(result[1])),
|
||||
Collections.unmodifiableCollection(alleles),
|
||||
genotypes,
|
||||
Double.valueOf(result[5]),
|
||||
filters,
|
||||
new HashMap<String,Object>());
|
||||
rec.validate();
|
||||
return rec;
|
||||
public VCF4Codec(boolean itsOKImTesting) {
|
||||
if ( ! itsOKImTesting )
|
||||
throw new StingException("DON'T USE THIS");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
|
||||
* @param reader the line reader to take header lines from
|
||||
|
|
@ -118,39 +71,175 @@ public class VCF4Codec implements FeatureCodec {
|
|||
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
private static int ZERO_CHAR = (byte)'0';
|
||||
private static Allele oneAllele(char index, List<Allele> alleles) {
|
||||
if ( index == '.' )
|
||||
return Allele.NO_CALL;
|
||||
else {
|
||||
int i = ((byte)index) - ZERO_CHAR;
|
||||
return alleles.get(i);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* create the genotype object from the VCF record
|
||||
* @param name the name of the sample
|
||||
* @param VCFEntry the entry; the text containing all the fields corresponding to the format fields
|
||||
* @param formatStrings the format string
|
||||
* @param altAlleleList our alt alleles
|
||||
* @param reference the reference base(s)
|
||||
* @return a Genotype object
|
||||
*/
|
||||
private Genotype createGenotypeFromString(String name, String VCFEntry, String[] formatStrings, String[] altAlleleList, String reference) {
|
||||
// split the text entry into parts
|
||||
String genotypeSplit[] = VCFEntry.split(":");
|
||||
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
|
||||
|
||||
Set<Allele> aList = new TreeSet<Allele>();
|
||||
Map<String, Object> attributes = new LinkedHashMap<String,Object>();
|
||||
static long cacheHit = 0, gtParse = 0;
|
||||
|
||||
// for each entry in the vcf field (we drive by this so that dropped fields aren't processed
|
||||
for (int index = 0; index < genotypeSplit.length; index++) {
|
||||
if (formatStrings[index].toUpperCase().equals("GT")) {
|
||||
String[] genotypes = genotypeSplit[index].split("[\\\\|\\/]+");
|
||||
for (String g : genotypes) {
|
||||
int altIndex = Integer.valueOf(g);
|
||||
if (altIndex == 0)
|
||||
aList.add(new Allele(reference,true));
|
||||
else
|
||||
aList.add(new Allele(altAlleleList[altIndex-1]));
|
||||
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||
// this should cache results [since they are immutable] and return a single object for each genotype
|
||||
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
||||
List<Allele> GTAlleles = cache.get(GT);
|
||||
if ( GTAlleles == null ) {
|
||||
GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles));
|
||||
cache.put(GT, GTAlleles);
|
||||
}
|
||||
// else {
|
||||
// cacheHit++;
|
||||
// }
|
||||
// gtParse++;
|
||||
//
|
||||
// if ( cacheHit % 10000 == 0 )
|
||||
// System.out.printf("Cache hit %d %d %.2f%n", cacheHit, gtParse, (100.0*cacheHit) / gtParse);
|
||||
|
||||
return GTAlleles;
|
||||
}
|
||||
|
||||
private Map<String, Object> parseInfo(String infoField, String id) {
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
||||
if ( ! infoField.equals(".") ) { // empty info field
|
||||
for ( String field : infoField.split(";") ) {
|
||||
int eqI = field.indexOf("=");
|
||||
String key = null;
|
||||
Object value = null;
|
||||
if ( eqI != -1 ) {
|
||||
key = field.substring(0, eqI);
|
||||
value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc
|
||||
//System.out.printf("%s %s%n", key, value);
|
||||
} else {
|
||||
key = field;
|
||||
value = 1;
|
||||
}
|
||||
} else {
|
||||
attributes.put(formatStrings[index],genotypeSplit[index]);
|
||||
|
||||
attributes.put(key, value);
|
||||
}
|
||||
}
|
||||
return new Genotype(name,new ArrayList(aList),0.0,new HashSet<String>(),attributes,false);
|
||||
attributes.put("ID", id);
|
||||
return attributes;
|
||||
}
|
||||
|
||||
//private static String[] CachedGTKey = new String[100];
|
||||
private static String[] CachedGTValues = new String[100];
|
||||
|
||||
public static boolean parseGenotypesToo = false; // for performance testing purposes
|
||||
public static boolean validate = true; // for performance testing purposes
|
||||
private static boolean REQUIRE_HEADER = false;
|
||||
|
||||
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
|
||||
private String[] parts = null;
|
||||
|
||||
public Feature decode(String line) {
|
||||
if ( parts == null )
|
||||
parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header
|
||||
|
||||
int nParts = ParsingUtils.split(line, parts, '\t');
|
||||
|
||||
if (REQUIRE_HEADER) { // todo -- remove require header
|
||||
// our header cannot be null, we need the genotype sample names and counts
|
||||
if ( header == null) throw new IllegalStateException("VCF Header cannot be null");
|
||||
|
||||
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
|
||||
if (nParts != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);
|
||||
}
|
||||
|
||||
return parseVCFLine(parts, nParts);
|
||||
}
|
||||
|
||||
private static Double parseQual(String qualString) {
|
||||
return qualString.equals("-1") ? VariantContext.NO_NEG_LOG_10PERROR : Double.valueOf(qualString) / 10;
|
||||
}
|
||||
|
||||
private VariantContext parseVCFLine(String[] parts, int nParts) {
|
||||
// chr5 157273992 rs1211159 C T 0.00 0 . GT 0/0 0/0 0
|
||||
String contig = parts[0];
|
||||
long pos = Long.valueOf(parts[1]);
|
||||
String id = parts[2];
|
||||
String ref = parts[3];
|
||||
String alts = parts[4];
|
||||
Double qual = parseQual(parts[5]);
|
||||
String filter = parts[6];
|
||||
String info = parts[7];
|
||||
String GT = parts[8];
|
||||
int genotypesStart = 9;
|
||||
|
||||
// add the reference allele
|
||||
if ( ! Allele.acceptableAlleleBases(ref) ) {
|
||||
System.out.printf("Excluding vcf record %s%n", ref);
|
||||
return null;
|
||||
}
|
||||
|
||||
Set<String> filters = ! filter.equals(".") ? new HashSet<String>(Arrays.asList(filter.split(";"))) : null;
|
||||
Map<String, Object> attributes = parseInfo(info, id);
|
||||
|
||||
// add all of the alt alleles
|
||||
|
||||
// todo -- use Allele factor method, not new, so we can keep a cache of the alleles since they are always the same
|
||||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||
Allele refAllele = new Allele(ref, true);
|
||||
alleles.add(refAllele);
|
||||
|
||||
for ( String alt : alts.split(",") ) {
|
||||
if ( ! Allele.acceptableAlleleBases(alt) ) {
|
||||
//System.out.printf("Excluding vcf record %s%n", vcf);
|
||||
return null;
|
||||
}
|
||||
|
||||
Allele allele = new Allele(alt, false);
|
||||
if ( ! allele.isNoCall() )
|
||||
alleles.add(allele);
|
||||
}
|
||||
|
||||
String[] GTKeys = GT.split(":"); // to performance issue
|
||||
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(nParts);
|
||||
if ( parseGenotypesToo ) {
|
||||
alleleMap.clear();
|
||||
for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) {
|
||||
String sample = parts[genotypeOffset];
|
||||
String[] GTValues = CachedGTValues;
|
||||
ParsingUtils.split(sample, GTValues, ':'); // to performance issue
|
||||
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap);
|
||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||
|
||||
// todo -- the parsing of attributes could be made lazy for performance
|
||||
Map<String, String> gtAttributes = null;
|
||||
if ( GTKeys.length > 1 ) {
|
||||
gtAttributes = new HashMap<String, String>(GTKeys.length - 1);
|
||||
for ( int i = 1; i < GTKeys.length; i++ ) {
|
||||
if ( GTKeys[i].equals("GQ") ) {
|
||||
GTQual = parseQual(GTValues[i]);
|
||||
} else {
|
||||
gtAttributes.put(GTKeys[i], GTValues[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Set<String> genotypeFilters = null;
|
||||
// genotypeFilters = new HashSet<String>();
|
||||
// if ( vcfG.isFiltered() ) // setup the FL genotype filter fields
|
||||
// genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";")));
|
||||
|
||||
boolean phased = GTKeys[0].charAt(1) == '|';
|
||||
Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
||||
genotypes.put(g.getSampleName(), g);
|
||||
}
|
||||
}
|
||||
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig,pos,pos+refAllele.length()-1);
|
||||
|
||||
VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes);
|
||||
if ( validate ) vc.validate();
|
||||
return vc;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -159,6 +248,6 @@ public class VCF4Codec implements FeatureCodec {
|
|||
*/
|
||||
@Override
|
||||
public Class getFeatureType() {
|
||||
return VCF4Record.class;
|
||||
return VariantContext.class;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,45 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.features.vcf4;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* simple variant context wrapped as VCF4
|
||||
*/
|
||||
public class VCF4Record extends VariantContext implements Feature {
|
||||
/**
|
||||
* create a VCF4Record, which is really a variant context
|
||||
* @param name the name of the record
|
||||
* @param loc it's location
|
||||
* @param alleles the set of alleles
|
||||
* @param genotypes any genotypes for this record
|
||||
* @param negLog10PError the probability of being a wrong call
|
||||
* @param filters the set of filters applied to this variant
|
||||
* @param attributes any other attributes
|
||||
*/
|
||||
public VCF4Record(String name, GenomeLoc loc, Collection<Allele> alleles, Map<String, Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
|
||||
super(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getChr() {
|
||||
return getLocation().getContig();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getStart() {
|
||||
return (int)getLocation().getStart();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getEnd() {
|
||||
return (int)getLocation().getStop();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,149 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broad.tribble.util.AsciiLineReader;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.sun.xml.internal.ws.wsdl.parser.ParserUtil;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Apr 13, 2010
|
||||
*/
|
||||
public class VCF4ReaderTestWalker extends RodWalker<VCFRecord,Long> {
|
||||
@Argument(shortName="MR", doc="", required=false)
|
||||
int maxRecords = -1;
|
||||
@Argument(shortName="vcf", doc="", required=true)
|
||||
File vcfFile = null;
|
||||
@Argument(shortName="Parse", doc="", required=true)
|
||||
ParsingStatus splitFile = ParsingStatus.NONE;
|
||||
@Argument(shortName="DontValidate", doc="", required=false)
|
||||
boolean DontValidate = false;
|
||||
|
||||
@Argument(shortName="USE_VCF3", doc="", required=false)
|
||||
boolean USE_VCF3 = false;
|
||||
|
||||
|
||||
public enum ParsingStatus { NONE, SPLIT_LINES, VARIANTS, GENOTYPES }
|
||||
|
||||
public void initialize() {
|
||||
}
|
||||
|
||||
public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext context, AlignmentContext alicon) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public Long reduce(VCFRecord con, Long num) {
|
||||
if ( con == null ) {
|
||||
return num;
|
||||
}
|
||||
|
||||
return 1 + num;
|
||||
}
|
||||
|
||||
public Long reduceInit() {
|
||||
return 0l;
|
||||
}
|
||||
|
||||
String[] parts = new String[10000];
|
||||
public void onTraversalDone(Long num){
|
||||
VCF4Codec vcf4codec = new VCF4Codec(true);
|
||||
VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
|
||||
VCF4Codec.validate = ! DontValidate;
|
||||
|
||||
VCFCodec vcf3codec = new VCFCodec();
|
||||
|
||||
FeatureCodec codec = USE_VCF3 ? vcf3codec : vcf4codec;
|
||||
|
||||
try {
|
||||
AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(vcfFile));
|
||||
int lineNumber = codec.readHeader(lineReader);
|
||||
out.printf("Read %d header lines%n", lineNumber);
|
||||
|
||||
while (true) {
|
||||
String line = lineReader.readLine();
|
||||
|
||||
if ( line == null )
|
||||
break;
|
||||
|
||||
lineNumber++;
|
||||
if ( lineNumber >= maxRecords && maxRecords != -1 ) {
|
||||
return;
|
||||
}
|
||||
|
||||
if ( line.charAt(0) == '#' )
|
||||
continue;
|
||||
|
||||
Object vc = null;
|
||||
if ( splitFile == ParsingStatus.NONE ) {
|
||||
|
||||
}
|
||||
else if ( splitFile == ParsingStatus.SPLIT_LINES ) {
|
||||
// todo -- look at header and determine number of elements that need to be parsed. Should be static per file
|
||||
int nParts = ParsingUtils.split(line, parts, '\t');
|
||||
} else {
|
||||
vc = codec.decode(line);
|
||||
if ( USE_VCF3 ) {
|
||||
VCFRecord rec = (VCFRecord)vc;
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(rec.getChr(), rec.getStart());
|
||||
ReferenceContext ref = new ReferenceContext(loc, (byte)rec.getReference().charAt(0));
|
||||
vc = VariantContextAdaptors.toVariantContext("X", vc, ref);
|
||||
}
|
||||
}
|
||||
|
||||
if ( lineNumber % 10000 == 0 ) {
|
||||
System.out.printf("%10d: %s%n", lineNumber, line.subSequence(0, 50));
|
||||
System.out.printf("%10d: %s%n", lineNumber, vc);
|
||||
}
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException(e.getMessage());
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException(e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue