Optimized, nearly complete VCF4 reader that is 2-4x faster than the previous implementation, along with a VCF4 reader performance-testing walker that can read VCF3/VCF4 files, useful for benchmarking
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3487 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6482b87741
commit
b811e61ae1
|
|
@ -16,7 +16,7 @@ public class Genotype {
|
||||||
|
|
||||||
protected InferredGeneticContext commonInfo;
|
protected InferredGeneticContext commonInfo;
|
||||||
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
|
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
|
||||||
protected List<Allele> alleles = new ArrayList<Allele>();
|
protected List<Allele> alleles = null; // new ArrayList<Allele>();
|
||||||
private boolean genotypesArePhased = false;
|
private boolean genotypesArePhased = false;
|
||||||
|
|
||||||
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
|
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
|
||||||
|
|
|
||||||
|
|
@ -13,10 +13,13 @@ import java.util.*;
|
||||||
final class InferredGeneticContext {
|
final class InferredGeneticContext {
|
||||||
public static final double NO_NEG_LOG_10PERROR = -1.0;
|
public static final double NO_NEG_LOG_10PERROR = -1.0;
|
||||||
|
|
||||||
|
private static Set<String> NO_FILTERS = Collections.unmodifiableSet(new HashSet<String>());
|
||||||
|
private static Map<String, Object> NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap<String, Object>());
|
||||||
|
|
||||||
private double negLog10PError = NO_NEG_LOG_10PERROR;
|
private double negLog10PError = NO_NEG_LOG_10PERROR;
|
||||||
private String name = null;
|
private String name = null;
|
||||||
private Set<String> filters = new HashSet<String>();
|
private Set<String> filters = NO_FILTERS;
|
||||||
private Map<String, Object> attributes = new HashMap<String, Object>();
|
private Map<String, Object> attributes = NO_ATTRIBUTES;
|
||||||
|
|
||||||
// public InferredGeneticContext(String name) {
|
// public InferredGeneticContext(String name) {
|
||||||
// this.name = name;
|
// this.name = name;
|
||||||
|
|
@ -73,6 +76,9 @@ final class InferredGeneticContext {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addFilter(String filter) {
|
public void addFilter(String filter) {
|
||||||
|
if ( filters == NO_FILTERS ) // immutable -> mutable
|
||||||
|
filters = new HashSet<String>(filters);
|
||||||
|
|
||||||
if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this);
|
if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this);
|
||||||
if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this);
|
if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this);
|
||||||
filters.add(filter);
|
filters.add(filter);
|
||||||
|
|
@ -85,7 +91,10 @@ final class InferredGeneticContext {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void clearFilters() {
|
public void clearFilters() {
|
||||||
filters.clear();
|
if ( filters == NO_FILTERS )
|
||||||
|
filters = new HashSet<String>();
|
||||||
|
else
|
||||||
|
filters.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setFilters(Collection<String> filters) {
|
public void setFilters(Collection<String> filters) {
|
||||||
|
|
@ -123,7 +132,10 @@ final class InferredGeneticContext {
|
||||||
//
|
//
|
||||||
// ---------------------------------------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------------------------------------
|
||||||
public void clearAttributes() {
|
public void clearAttributes() {
|
||||||
this.attributes.clear();
|
if ( attributes == NO_ATTRIBUTES )
|
||||||
|
attributes = new HashMap<String, Object>();
|
||||||
|
else
|
||||||
|
this.attributes.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -136,7 +148,7 @@ final class InferredGeneticContext {
|
||||||
// todo -- define common attributes as enum
|
// todo -- define common attributes as enum
|
||||||
|
|
||||||
public void setAttributes(Map<String, ?> map) {
|
public void setAttributes(Map<String, ?> map) {
|
||||||
this.attributes.clear();
|
clearAttributes();
|
||||||
putAttributes(map);
|
putAttributes(map);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -148,10 +160,15 @@ final class InferredGeneticContext {
|
||||||
if ( hasAttribute(key) && ! allowOverwrites )
|
if ( hasAttribute(key) && ! allowOverwrites )
|
||||||
throw new StingException("Attempting to overwrite key->value binding: key = " + key + " this = " + this);
|
throw new StingException("Attempting to overwrite key->value binding: key = " + key + " this = " + this);
|
||||||
|
|
||||||
|
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
|
||||||
|
attributes = new HashMap<String, Object>(attributes);
|
||||||
|
|
||||||
this.attributes.put(key, value);
|
this.attributes.put(key, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void removeAttribute(String key) {
|
public void removeAttribute(String key) {
|
||||||
|
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
|
||||||
|
attributes = new HashMap<String, Object>(attributes);
|
||||||
this.attributes.remove(key);
|
this.attributes.remove(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broad.tribble.vcf.VCFRecord;
|
import org.broad.tribble.vcf.VCFRecord;
|
||||||
|
import org.broad.tribble.Feature;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -160,7 +161,7 @@ import java.util.*;
|
||||||
*
|
*
|
||||||
* @author depristo
|
* @author depristo
|
||||||
*/
|
*/
|
||||||
public class VariantContext {
|
public class VariantContext implements Feature { // to enable tribble intergration
|
||||||
protected InferredGeneticContext commonInfo = null;
|
protected InferredGeneticContext commonInfo = null;
|
||||||
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
|
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
|
||||||
|
|
||||||
|
|
@ -997,4 +998,24 @@ public class VariantContext {
|
||||||
return dest;
|
return dest;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// tribble integration routines -- not for public consumption
|
||||||
|
//
|
||||||
|
// ---------------------------------------------------------------------------------------------------------
|
||||||
|
@Override
|
||||||
|
public String getChr() {
|
||||||
|
return getLocation().getContig();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getStart() {
|
||||||
|
return (int)getLocation().getStart();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getEnd() {
|
||||||
|
return (int)getLocation().getStop();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -4,6 +4,7 @@ import org.broad.tribble.Feature;
|
||||||
import org.broad.tribble.FeatureCodec;
|
import org.broad.tribble.FeatureCodec;
|
||||||
import org.broad.tribble.exception.CodecLineParsingException;
|
import org.broad.tribble.exception.CodecLineParsingException;
|
||||||
import org.broad.tribble.util.LineReader;
|
import org.broad.tribble.util.LineReader;
|
||||||
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
import org.broad.tribble.vcf.VCFHeader;
|
import org.broad.tribble.vcf.VCFHeader;
|
||||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||||
import org.broad.tribble.vcf.VCFReaderUtils;
|
import org.broad.tribble.vcf.VCFReaderUtils;
|
||||||
|
|
@ -12,6 +13,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -30,60 +32,11 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
throw new StingException("DON'T USE THIS");
|
throw new StingException("DON'T USE THIS");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public VCF4Codec(boolean itsOKImTesting) {
|
||||||
* this a super hack like method to parse out what we need from a variant context
|
if ( ! itsOKImTesting )
|
||||||
* @param line the line to parse
|
throw new StingException("DON'T USE THIS");
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Feature decode(String line) {
|
|
||||||
|
|
||||||
// our header cannot be null, we need the genotype sample names and counts
|
|
||||||
if (header == null) throw new IllegalStateException("VCF Header cannot be null");
|
|
||||||
|
|
||||||
// split the line on whitespace (Jim's parser will be faster, but it's broken right now)
|
|
||||||
String[] result = line.split("\\s+");
|
|
||||||
|
|
||||||
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
|
|
||||||
if (result.length != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + result.length+ " for line " + line);
|
|
||||||
|
|
||||||
// our genotype names
|
|
||||||
Iterator<String> iter = header.getGenotypeSamples().iterator();
|
|
||||||
|
|
||||||
// out genotype map, sample name to genotype
|
|
||||||
Map<String, Genotype> genotypes = new LinkedHashMap<String,Genotype>();
|
|
||||||
|
|
||||||
// our allele list, add the reference and the alts
|
|
||||||
List<Allele> alleles = new ArrayList<Allele>();
|
|
||||||
String[] alts = result[4].split(",");
|
|
||||||
for (String alt : alts)
|
|
||||||
alleles.add(new Allele(alt,false));
|
|
||||||
alleles.add(new Allele(result[3],true));
|
|
||||||
|
|
||||||
// parse out each of the genotypes
|
|
||||||
for (int genotypeIndex = 9; genotypeIndex < header.getColumnCount(); genotypeIndex++) {
|
|
||||||
if (!iter.hasNext()) throw new StingException("Wrong number of samples!");
|
|
||||||
String sample = iter.next();
|
|
||||||
genotypes.put(sample,createGenotypeFromString(sample,result[genotypeIndex],result[8].split(":"),alts,result[3]));
|
|
||||||
}
|
|
||||||
|
|
||||||
// make a new set of all the filters
|
|
||||||
Set<String> filters = new TreeSet<String>();
|
|
||||||
filters.addAll(Arrays.asList(result[5].split(",")));
|
|
||||||
|
|
||||||
// create, validate, and return the record
|
|
||||||
VCF4Record rec = new VCF4Record(result[2],
|
|
||||||
GenomeLocParser.createGenomeLoc(result[0],Long.valueOf(result[1])),
|
|
||||||
Collections.unmodifiableCollection(alleles),
|
|
||||||
genotypes,
|
|
||||||
Double.valueOf(result[5]),
|
|
||||||
filters,
|
|
||||||
new HashMap<String,Object>());
|
|
||||||
rec.validate();
|
|
||||||
return rec;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
|
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
|
||||||
* @param reader the line reader to take header lines from
|
* @param reader the line reader to take header lines from
|
||||||
|
|
@ -118,39 +71,175 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static int ZERO_CHAR = (byte)'0';
|
||||||
|
private static Allele oneAllele(char index, List<Allele> alleles) {
|
||||||
|
if ( index == '.' )
|
||||||
|
return Allele.NO_CALL;
|
||||||
|
else {
|
||||||
|
int i = ((byte)index) - ZERO_CHAR;
|
||||||
|
return alleles.get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
|
||||||
* create the genotype object from the VCF record
|
|
||||||
* @param name the name of the sample
|
|
||||||
* @param VCFEntry the entry; the text containing all the fields corresponding to the format fields
|
|
||||||
* @param formatStrings the format string
|
|
||||||
* @param altAlleleList our alt alleles
|
|
||||||
* @param reference the reference base(s)
|
|
||||||
* @return a Genotype object
|
|
||||||
*/
|
|
||||||
private Genotype createGenotypeFromString(String name, String VCFEntry, String[] formatStrings, String[] altAlleleList, String reference) {
|
|
||||||
// split the text entry into parts
|
|
||||||
String genotypeSplit[] = VCFEntry.split(":");
|
|
||||||
|
|
||||||
Set<Allele> aList = new TreeSet<Allele>();
|
static long cacheHit = 0, gtParse = 0;
|
||||||
Map<String, Object> attributes = new LinkedHashMap<String,Object>();
|
|
||||||
|
|
||||||
// for each entry in the vcf field (we drive by this so that dropped fields aren't processed
|
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||||
for (int index = 0; index < genotypeSplit.length; index++) {
|
// this should cache results [since they are immutable] and return a single object for each genotype
|
||||||
if (formatStrings[index].toUpperCase().equals("GT")) {
|
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
||||||
String[] genotypes = genotypeSplit[index].split("[\\\\|\\/]+");
|
List<Allele> GTAlleles = cache.get(GT);
|
||||||
for (String g : genotypes) {
|
if ( GTAlleles == null ) {
|
||||||
int altIndex = Integer.valueOf(g);
|
GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles));
|
||||||
if (altIndex == 0)
|
cache.put(GT, GTAlleles);
|
||||||
aList.add(new Allele(reference,true));
|
}
|
||||||
else
|
// else {
|
||||||
aList.add(new Allele(altAlleleList[altIndex-1]));
|
// cacheHit++;
|
||||||
|
// }
|
||||||
|
// gtParse++;
|
||||||
|
//
|
||||||
|
// if ( cacheHit % 10000 == 0 )
|
||||||
|
// System.out.printf("Cache hit %d %d %.2f%n", cacheHit, gtParse, (100.0*cacheHit) / gtParse);
|
||||||
|
|
||||||
|
return GTAlleles;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, Object> parseInfo(String infoField, String id) {
|
||||||
|
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||||
|
|
||||||
|
if ( ! infoField.equals(".") ) { // empty info field
|
||||||
|
for ( String field : infoField.split(";") ) {
|
||||||
|
int eqI = field.indexOf("=");
|
||||||
|
String key = null;
|
||||||
|
Object value = null;
|
||||||
|
if ( eqI != -1 ) {
|
||||||
|
key = field.substring(0, eqI);
|
||||||
|
value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc
|
||||||
|
//System.out.printf("%s %s%n", key, value);
|
||||||
|
} else {
|
||||||
|
key = field;
|
||||||
|
value = 1;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
attributes.put(formatStrings[index],genotypeSplit[index]);
|
attributes.put(key, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return new Genotype(name,new ArrayList(aList),0.0,new HashSet<String>(),attributes,false);
|
attributes.put("ID", id);
|
||||||
|
return attributes;
|
||||||
|
}
|
||||||
|
|
||||||
|
//private static String[] CachedGTKey = new String[100];
|
||||||
|
private static String[] CachedGTValues = new String[100];
|
||||||
|
|
||||||
|
public static boolean parseGenotypesToo = false; // for performance testing purposes
|
||||||
|
public static boolean validate = true; // for performance testing purposes
|
||||||
|
private static boolean REQUIRE_HEADER = false;
|
||||||
|
|
||||||
|
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
|
||||||
|
private String[] parts = null;
|
||||||
|
|
||||||
|
public Feature decode(String line) {
|
||||||
|
if ( parts == null )
|
||||||
|
parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header
|
||||||
|
|
||||||
|
int nParts = ParsingUtils.split(line, parts, '\t');
|
||||||
|
|
||||||
|
if (REQUIRE_HEADER) { // todo -- remove require header
|
||||||
|
// our header cannot be null, we need the genotype sample names and counts
|
||||||
|
if ( header == null) throw new IllegalStateException("VCF Header cannot be null");
|
||||||
|
|
||||||
|
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
|
||||||
|
if (nParts != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);
|
||||||
|
}
|
||||||
|
|
||||||
|
return parseVCFLine(parts, nParts);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double parseQual(String qualString) {
|
||||||
|
return qualString.equals("-1") ? VariantContext.NO_NEG_LOG_10PERROR : Double.valueOf(qualString) / 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
private VariantContext parseVCFLine(String[] parts, int nParts) {
|
||||||
|
// chr5 157273992 rs1211159 C T 0.00 0 . GT 0/0 0/0 0
|
||||||
|
String contig = parts[0];
|
||||||
|
long pos = Long.valueOf(parts[1]);
|
||||||
|
String id = parts[2];
|
||||||
|
String ref = parts[3];
|
||||||
|
String alts = parts[4];
|
||||||
|
Double qual = parseQual(parts[5]);
|
||||||
|
String filter = parts[6];
|
||||||
|
String info = parts[7];
|
||||||
|
String GT = parts[8];
|
||||||
|
int genotypesStart = 9;
|
||||||
|
|
||||||
|
// add the reference allele
|
||||||
|
if ( ! Allele.acceptableAlleleBases(ref) ) {
|
||||||
|
System.out.printf("Excluding vcf record %s%n", ref);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> filters = ! filter.equals(".") ? new HashSet<String>(Arrays.asList(filter.split(";"))) : null;
|
||||||
|
Map<String, Object> attributes = parseInfo(info, id);
|
||||||
|
|
||||||
|
// add all of the alt alleles
|
||||||
|
|
||||||
|
// todo -- use Allele factor method, not new, so we can keep a cache of the alleles since they are always the same
|
||||||
|
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||||
|
Allele refAllele = new Allele(ref, true);
|
||||||
|
alleles.add(refAllele);
|
||||||
|
|
||||||
|
for ( String alt : alts.split(",") ) {
|
||||||
|
if ( ! Allele.acceptableAlleleBases(alt) ) {
|
||||||
|
//System.out.printf("Excluding vcf record %s%n", vcf);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
Allele allele = new Allele(alt, false);
|
||||||
|
if ( ! allele.isNoCall() )
|
||||||
|
alleles.add(allele);
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] GTKeys = GT.split(":"); // to performance issue
|
||||||
|
|
||||||
|
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(nParts);
|
||||||
|
if ( parseGenotypesToo ) {
|
||||||
|
alleleMap.clear();
|
||||||
|
for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) {
|
||||||
|
String sample = parts[genotypeOffset];
|
||||||
|
String[] GTValues = CachedGTValues;
|
||||||
|
ParsingUtils.split(sample, GTValues, ':'); // to performance issue
|
||||||
|
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap);
|
||||||
|
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||||
|
|
||||||
|
// todo -- the parsing of attributes could be made lazy for performance
|
||||||
|
Map<String, String> gtAttributes = null;
|
||||||
|
if ( GTKeys.length > 1 ) {
|
||||||
|
gtAttributes = new HashMap<String, String>(GTKeys.length - 1);
|
||||||
|
for ( int i = 1; i < GTKeys.length; i++ ) {
|
||||||
|
if ( GTKeys[i].equals("GQ") ) {
|
||||||
|
GTQual = parseQual(GTValues[i]);
|
||||||
|
} else {
|
||||||
|
gtAttributes.put(GTKeys[i], GTValues[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> genotypeFilters = null;
|
||||||
|
// genotypeFilters = new HashSet<String>();
|
||||||
|
// if ( vcfG.isFiltered() ) // setup the FL genotype filter fields
|
||||||
|
// genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";")));
|
||||||
|
|
||||||
|
boolean phased = GTKeys[0].charAt(1) == '|';
|
||||||
|
Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
||||||
|
genotypes.put(g.getSampleName(), g);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig,pos,pos+refAllele.length()-1);
|
||||||
|
|
||||||
|
VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes);
|
||||||
|
if ( validate ) vc.validate();
|
||||||
|
return vc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -159,6 +248,6 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Class getFeatureType() {
|
public Class getFeatureType() {
|
||||||
return VCF4Record.class;
|
return VariantContext.class;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,45 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.refdata.features.vcf4;
|
|
||||||
|
|
||||||
import org.broad.tribble.Feature;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* simple variant context wrapped as VCF4
|
|
||||||
*/
|
|
||||||
public class VCF4Record extends VariantContext implements Feature {
|
|
||||||
/**
|
|
||||||
* create a VCF4Record, which is really a variant context
|
|
||||||
* @param name the name of the record
|
|
||||||
* @param loc it's location
|
|
||||||
* @param alleles the set of alleles
|
|
||||||
* @param genotypes any genotypes for this record
|
|
||||||
* @param negLog10PError the probability of being a wrong call
|
|
||||||
* @param filters the set of filters applied to this variant
|
|
||||||
* @param attributes any other attributes
|
|
||||||
*/
|
|
||||||
public VCF4Record(String name, GenomeLoc loc, Collection<Allele> alleles, Map<String, Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
|
|
||||||
super(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getChr() {
|
|
||||||
return getLocation().getContig();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int getStart() {
|
|
||||||
return (int)getLocation().getStart();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int getEnd() {
|
|
||||||
return (int)getLocation().getStop();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -0,0 +1,149 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||||
|
|
||||||
|
import org.broad.tribble.vcf.*;
|
||||||
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
|
import org.broad.tribble.util.AsciiLineReader;
|
||||||
|
import org.broad.tribble.FeatureCodec;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
import com.sun.xml.internal.ws.wsdl.parser.ParserUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||||
|
*
|
||||||
|
* @Author chartl
|
||||||
|
* @Date Apr 13, 2010
|
||||||
|
*/
|
||||||
|
public class VCF4ReaderTestWalker extends RodWalker<VCFRecord,Long> {
|
||||||
|
@Argument(shortName="MR", doc="", required=false)
|
||||||
|
int maxRecords = -1;
|
||||||
|
@Argument(shortName="vcf", doc="", required=true)
|
||||||
|
File vcfFile = null;
|
||||||
|
@Argument(shortName="Parse", doc="", required=true)
|
||||||
|
ParsingStatus splitFile = ParsingStatus.NONE;
|
||||||
|
@Argument(shortName="DontValidate", doc="", required=false)
|
||||||
|
boolean DontValidate = false;
|
||||||
|
|
||||||
|
@Argument(shortName="USE_VCF3", doc="", required=false)
|
||||||
|
boolean USE_VCF3 = false;
|
||||||
|
|
||||||
|
|
||||||
|
public enum ParsingStatus { NONE, SPLIT_LINES, VARIANTS, GENOTYPES }
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext context, AlignmentContext alicon) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long reduce(VCFRecord con, Long num) {
|
||||||
|
if ( con == null ) {
|
||||||
|
return num;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1 + num;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long reduceInit() {
|
||||||
|
return 0l;
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] parts = new String[10000];
|
||||||
|
public void onTraversalDone(Long num){
|
||||||
|
VCF4Codec vcf4codec = new VCF4Codec(true);
|
||||||
|
VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
|
||||||
|
VCF4Codec.validate = ! DontValidate;
|
||||||
|
|
||||||
|
VCFCodec vcf3codec = new VCFCodec();
|
||||||
|
|
||||||
|
FeatureCodec codec = USE_VCF3 ? vcf3codec : vcf4codec;
|
||||||
|
|
||||||
|
try {
|
||||||
|
AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(vcfFile));
|
||||||
|
int lineNumber = codec.readHeader(lineReader);
|
||||||
|
out.printf("Read %d header lines%n", lineNumber);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
String line = lineReader.readLine();
|
||||||
|
|
||||||
|
if ( line == null )
|
||||||
|
break;
|
||||||
|
|
||||||
|
lineNumber++;
|
||||||
|
if ( lineNumber >= maxRecords && maxRecords != -1 ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( line.charAt(0) == '#' )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
Object vc = null;
|
||||||
|
if ( splitFile == ParsingStatus.NONE ) {
|
||||||
|
|
||||||
|
}
|
||||||
|
else if ( splitFile == ParsingStatus.SPLIT_LINES ) {
|
||||||
|
// todo -- look at header and determine number of elements that need to be parsed. Should be static per file
|
||||||
|
int nParts = ParsingUtils.split(line, parts, '\t');
|
||||||
|
} else {
|
||||||
|
vc = codec.decode(line);
|
||||||
|
if ( USE_VCF3 ) {
|
||||||
|
VCFRecord rec = (VCFRecord)vc;
|
||||||
|
GenomeLoc loc = GenomeLocParser.createGenomeLoc(rec.getChr(), rec.getStart());
|
||||||
|
ReferenceContext ref = new ReferenceContext(loc, (byte)rec.getReference().charAt(0));
|
||||||
|
vc = VariantContextAdaptors.toVariantContext("X", vc, ref);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( lineNumber % 10000 == 0 ) {
|
||||||
|
System.out.printf("%10d: %s%n", lineNumber, line.subSequence(0, 50));
|
||||||
|
System.out.printf("%10d: %s%n", lineNumber, vc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new StingException(e.getMessage());
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new StingException(e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue