Optimized, nearly complete VCF4 reader that is 2-4x faster than the previous implementation, along with a VCF4 reader performance-testing walker that can read VCF 3/4 files, useful for benchmarking

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3487 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-06-04 18:11:38 +00:00
parent 6482b87741
commit b811e61ae1
6 changed files with 363 additions and 132 deletions

View File

@ -16,7 +16,7 @@ public class Genotype {
protected InferredGeneticContext commonInfo;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
protected List<Allele> alleles = new ArrayList<Allele>();
protected List<Allele> alleles = null; // new ArrayList<Allele>();
private boolean genotypesArePhased = false;
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {

View File

@ -13,10 +13,13 @@ import java.util.*;
final class InferredGeneticContext {
public static final double NO_NEG_LOG_10PERROR = -1.0;
private static Set<String> NO_FILTERS = Collections.unmodifiableSet(new HashSet<String>());
private static Map<String, Object> NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap<String, Object>());
private double negLog10PError = NO_NEG_LOG_10PERROR;
private String name = null;
private Set<String> filters = new HashSet<String>();
private Map<String, Object> attributes = new HashMap<String, Object>();
private Set<String> filters = NO_FILTERS;
private Map<String, Object> attributes = NO_ATTRIBUTES;
// public InferredGeneticContext(String name) {
// this.name = name;
@ -73,6 +76,9 @@ final class InferredGeneticContext {
}
public void addFilter(String filter) {
if ( filters == NO_FILTERS ) // immutable -> mutable
filters = new HashSet<String>(filters);
if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this);
if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this);
filters.add(filter);
@ -85,7 +91,10 @@ final class InferredGeneticContext {
}
/**
 * Remove all filters from this context.
 *
 * filters may still reference the shared immutable NO_FILTERS sentinel; in that
 * case swap in a fresh mutable set rather than calling clear() on the
 * unmodifiable collection (which would throw UnsupportedOperationException).
 */
public void clearFilters() {
    if ( filters == NO_FILTERS )
        filters = new HashSet<String>();
    else
        filters.clear();
}
public void setFilters(Collection<String> filters) {
@ -123,7 +132,10 @@ final class InferredGeneticContext {
//
// ---------------------------------------------------------------------------------------------------------
/**
 * Remove all attributes from this context.
 *
 * attributes may still reference the shared immutable NO_ATTRIBUTES sentinel;
 * in that case swap in a fresh mutable map rather than calling clear() on the
 * unmodifiable map (which would throw UnsupportedOperationException).
 */
public void clearAttributes() {
    if ( attributes == NO_ATTRIBUTES )
        attributes = new HashMap<String, Object>();
    else
        attributes.clear();
}
/**
@ -136,7 +148,7 @@ final class InferredGeneticContext {
// todo -- define common attributes as enum
/**
 * Replace all current attributes with the bindings in map.
 *
 * Delegates clearing to clearAttributes() so the shared immutable
 * NO_ATTRIBUTES sentinel is handled correctly before the new bindings
 * are copied in.
 *
 * @param map the new attribute bindings; values are copied via putAttributes
 */
public void setAttributes(Map<String, ?> map) {
    clearAttributes();
    putAttributes(map);
}
@ -148,10 +160,15 @@ final class InferredGeneticContext {
if ( hasAttribute(key) && ! allowOverwrites )
throw new StingException("Attempting to overwrite key->value binding: key = " + key + " this = " + this);
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
attributes = new HashMap<String, Object>(attributes);
this.attributes.put(key, value);
}
/**
 * Remove the binding for key, if present.
 *
 * When attributes is still the shared immutable NO_ATTRIBUTES sentinel the map
 * is empty, so there is nothing to remove; returning early avoids pointlessly
 * copying an empty map just to delete from it.
 *
 * @param key the attribute key to remove
 */
public void removeAttribute(String key) {
    if ( attributes == NO_ATTRIBUTES ) // immutable and empty -> nothing to do
        return;
    this.attributes.remove(key);
}

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broad.tribble.vcf.VCFRecord;
import org.broad.tribble.Feature;
import java.util.*;
@ -160,7 +161,7 @@ import java.util.*;
*
* @author depristo
*/
public class VariantContext {
public class VariantContext implements Feature { // to enable tribble intergration
protected InferredGeneticContext commonInfo = null;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
@ -997,4 +998,24 @@ public class VariantContext {
return dest;
}
// ---------------------------------------------------------------------------------------------------------
//
// tribble integration routines -- not for public consumption
//
// ---------------------------------------------------------------------------------------------------------
@Override
public String getChr() {
    // Tribble Feature accessor: contig name taken from this context's GenomeLoc
    return getLocation().getContig();
}
@Override
public int getStart() {
    // Tribble Feature accessor; GenomeLoc uses long coordinates, Feature uses int
    return (int)getLocation().getStart();
}
@Override
public int getEnd() {
    // Tribble Feature accessor; stop coordinate narrowed from long to int
    return (int)getLocation().getStop();
}
}

View File

@ -4,6 +4,7 @@ import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.util.LineReader;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFReaderUtils;
@ -12,6 +13,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.IOException;
import java.util.*;
@ -30,60 +32,11 @@ public class VCF4Codec implements FeatureCodec {
throw new StingException("DON'T USE THIS");
}
/**
* this a super hack like method to parse out what we need from a variant context
* @param line the line to parse
* @return
*/
@Override
public Feature decode(String line) {
// our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new IllegalStateException("VCF Header cannot be null");
// split the line on whitespace (Jim's parser will be faster, but it's broken right now)
String[] result = line.split("\\s+");
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
if (result.length != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + result.length+ " for line " + line);
// our genotype names
Iterator<String> iter = header.getGenotypeSamples().iterator();
// out genotype map, sample name to genotype
Map<String, Genotype> genotypes = new LinkedHashMap<String,Genotype>();
// our allele list, add the reference and the alts
List<Allele> alleles = new ArrayList<Allele>();
String[] alts = result[4].split(",");
for (String alt : alts)
alleles.add(new Allele(alt,false));
alleles.add(new Allele(result[3],true));
// parse out each of the genotypes
for (int genotypeIndex = 9; genotypeIndex < header.getColumnCount(); genotypeIndex++) {
if (!iter.hasNext()) throw new StingException("Wrong number of samples!");
String sample = iter.next();
genotypes.put(sample,createGenotypeFromString(sample,result[genotypeIndex],result[8].split(":"),alts,result[3]));
}
// make a new set of all the filters
Set<String> filters = new TreeSet<String>();
filters.addAll(Arrays.asList(result[5].split(",")));
// create, validate, and return the record
VCF4Record rec = new VCF4Record(result[2],
GenomeLocParser.createGenomeLoc(result[0],Long.valueOf(result[1])),
Collections.unmodifiableCollection(alleles),
genotypes,
Double.valueOf(result[5]),
filters,
new HashMap<String,Object>());
rec.validate();
return rec;
/**
 * Testing-only constructor: this codec is not yet ready for production use,
 * so construction is gated behind an explicit acknowledgement flag.
 *
 * @param itsOKImTesting must be true; passing false throws to prevent accidental use
 */
public VCF4Codec(boolean itsOKImTesting) {
    if ( ! itsOKImTesting )
        throw new StingException("DON'T USE THIS");
}
/**
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
* @param reader the line reader to take header lines from
@ -118,39 +71,175 @@ public class VCF4Codec implements FeatureCodec {
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
}
// offset for converting an ASCII digit character to its numeric allele index
private static final int ZERO_CHAR = '0';

/**
 * Map a single-character allele index from a GT field onto an Allele.
 *
 * @param index   a digit character ('0' = reference, '1'+ = an alt) or '.' for no-call
 * @param alleles the alleles at this site, reference first, in VCF order
 * @return the corresponding Allele, or Allele.NO_CALL for '.'
 */
private static Allele oneAllele(char index, List<Allele> alleles) {
    if ( index == '.' )
        return Allele.NO_CALL;
    // ASCII arithmetic: '0' -> 0, '1' -> 1, ...; an out-of-range index
    // surfaces as IndexOutOfBoundsException from alleles.get
    return alleles.get(index - ZERO_CHAR);
}
/**
* create the genotype object from the VCF record
* @param name the name of the sample
* @param VCFEntry the entry; the text containing all the fields corresponding to the format fields
* @param formatStrings the format string
* @param altAlleleList our alt alleles
* @param reference the reference base(s)
* @return a Genotype object
*/
private Genotype createGenotypeFromString(String name, String VCFEntry, String[] formatStrings, String[] altAlleleList, String reference) {
// split the text entry into parts
String genotypeSplit[] = VCFEntry.split(":");
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
Set<Allele> aList = new TreeSet<Allele>();
Map<String, Object> attributes = new LinkedHashMap<String,Object>();
static long cacheHit = 0, gtParse = 0;
// for each entry in the vcf field (we drive by this so that dropped fields aren't processed
for (int index = 0; index < genotypeSplit.length; index++) {
if (formatStrings[index].toUpperCase().equals("GT")) {
String[] genotypes = genotypeSplit[index].split("[\\\\|\\/]+");
for (String g : genotypes) {
int altIndex = Integer.valueOf(g);
if (altIndex == 0)
aList.add(new Allele(reference,true));
else
aList.add(new Allele(altAlleleList[altIndex-1]));
/**
 * Parse a diploid GT string ("0/1", "1|1", "./.") into a two-element allele list.
 *
 * Results are cached per GT string: the lists are immutable and identical for
 * every sample carrying the same genotype at this site, so each distinct
 * genotype string is only parsed once per record.
 *
 * @param GT      three-character genotype field: allele char, separator, allele char
 * @param alleles the alleles at this site, reference first
 * @param cache   per-record cache from GT string to its allele list
 * @return the (possibly cached) two-element allele list
 */
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
    if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
    List<Allele> GTAlleles = cache.get(GT);
    if ( GTAlleles == null ) {
        GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles));
        cache.put(GT, GTAlleles);
    }
    return GTAlleles;
}
private Map<String, Object> parseInfo(String infoField, String id) {
Map<String, Object> attributes = new HashMap<String, Object>();
if ( ! infoField.equals(".") ) { // empty info field
for ( String field : infoField.split(";") ) {
int eqI = field.indexOf("=");
String key = null;
Object value = null;
if ( eqI != -1 ) {
key = field.substring(0, eqI);
value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc
//System.out.printf("%s %s%n", key, value);
} else {
key = field;
value = 1;
}
} else {
attributes.put(formatStrings[index],genotypeSplit[index]);
attributes.put(key, value);
}
}
return new Genotype(name,new ArrayList(aList),0.0,new HashSet<String>(),attributes,false);
attributes.put("ID", id);
return attributes;
}
// reusable scratch buffer for split per-sample genotype values (avoids reallocating per sample)
private static String[] CachedGTValues = new String[100];

public static boolean parseGenotypesToo = false; // for performance testing purposes
public static boolean validate = true;           // for performance testing purposes
private static boolean REQUIRE_HEADER = false;

// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
private String[] parts = null;

/**
 * Decode a single VCF data line into a VariantContext Feature.
 *
 * The line is tab-split into the reusable parts buffer and handed to
 * parseVCFLine; the buffer is sized from the header when REQUIRE_HEADER is on,
 * otherwise generously over-allocated.
 *
 * @param line one tab-delimited VCF data line (not a header line)
 * @return the decoded VariantContext, or null if the record was rejected
 */
public Feature decode(String line) {
    // the header null check must happen BEFORE we size the parts array from it,
    // otherwise header.getColumnCount() below would NPE instead of reporting the problem
    if ( REQUIRE_HEADER && header == null ) // todo -- remove require header
        throw new IllegalStateException("VCF Header cannot be null");

    if ( parts == null )
        parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header

    int nParts = ParsingUtils.split(line, parts, '\t');

    // check to make sure the split resulted in the correct number of fields (8 + (1 + genotype count) if it has genotypes)
    if ( REQUIRE_HEADER && nParts != header.getColumnCount() ) // todo -- remove require header
        throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);

    return parseVCFLine(parts, nParts);
}
/**
 * Convert a phred-scaled quality string into a negLog10PError value.
 *
 * Both "-1" and "." (the VCF 4.0 missing-value marker) are treated as
 * "no quality available"; otherwise the phred value is divided by 10 to
 * obtain -log10(P(error)). Used for both the QUAL column and GQ fields.
 *
 * @param qualString the raw quality text
 * @return the negLog10PError, or VariantContext.NO_NEG_LOG_10PERROR when missing
 */
private static Double parseQual(String qualString) {
    if ( qualString.equals("-1") || qualString.equals(".") )
        return VariantContext.NO_NEG_LOG_10PERROR;
    return Double.valueOf(qualString) / 10;
}
/**
 * Build a VariantContext from a tab-split VCF 4 data line.
 *
 * Columns are CHROM POS ID REF ALT QUAL FILTER INFO [FORMAT sample1 sample2 ...].
 * Genotype columns are only parsed when parseGenotypesToo is set.
 *
 * @param parts  the tab-split fields of the line (buffer may be longer than nParts)
 * @param nParts the number of valid entries in parts for this line
 * @return the VariantContext, or null when REF or an ALT allele contains
 *         bases this reader cannot represent
 */
private VariantContext parseVCFLine(String[] parts, int nParts) {
    // chr5 157273992 rs1211159 C T 0.00 0 . GT 0/0 0/0 0
    String contig = parts[0];
    long pos = Long.valueOf(parts[1]);
    String id = parts[2];
    String ref = parts[3];
    String alts = parts[4];
    Double qual = parseQual(parts[5]);
    String filter = parts[6];
    String info = parts[7];
    int genotypesStart = 9;

    // reject records whose reference allele we can't represent
    if ( ! Allele.acceptableAlleleBases(ref) ) {
        System.out.printf("Excluding vcf record %s%n", ref);
        return null;
    }

    Set<String> filters = ! filter.equals(".") ? new HashSet<String>(Arrays.asList(filter.split(";"))) : null;
    Map<String, Object> attributes = parseInfo(info, id);

    // add all of the alt alleles
    // todo -- use Allele factor method, not new, so we can keep a cache of the alleles since they are always the same
    List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
    Allele refAllele = new Allele(ref, true);
    alleles.add(refAllele);
    for ( String alt : alts.split(",") ) {
        if ( ! Allele.acceptableAlleleBases(alt) ) {
            return null;
        }
        Allele allele = new Allele(alt, false);
        if ( ! allele.isNoCall() )
            alleles.add(allele);
    }

    Map<String, Genotype> genotypes = new HashMap<String, Genotype>(nParts);

    // Only touch the FORMAT column when the line actually has one: a sites-only
    // VCF has just 8 columns, and parts[8] would otherwise hold stale data from
    // a previous, longer line (the parts buffer is reused across calls).
    if ( parseGenotypesToo && nParts > 8 ) {
        String[] GTKeys = parts[8].split(":"); // to performance issue
        alleleMap.clear();
        for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) {
            String sample = parts[genotypeOffset];
            String[] GTValues = CachedGTValues;
            ParsingUtils.split(sample, GTValues, ':'); // to performance issue
            List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap);
            double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
            // todo -- the parsing of attributes could be made lazy for performance
            Map<String, String> gtAttributes = null;
            if ( GTKeys.length > 1 ) {
                gtAttributes = new HashMap<String, String>(GTKeys.length - 1);
                for ( int i = 1; i < GTKeys.length; i++ ) {
                    if ( GTKeys[i].equals("GQ") ) {
                        GTQual = parseQual(GTValues[i]);
                    } else {
                        gtAttributes.put(GTKeys[i], GTValues[i]);
                    }
                }
            }
            // todo -- parse per-genotype FL filter fields into genotypeFilters
            Set<String> genotypeFilters = null;
            boolean phased = GTKeys[0].charAt(1) == '|';
            // note: sample names are placeholders until header integration is complete
            Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
            genotypes.put(g.getSampleName(), g);
        }
    }

    GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig, pos, pos + refAllele.length() - 1);
    VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes);
    if ( validate ) vc.validate();
    return vc;
}
/**
@ -159,6 +248,6 @@ public class VCF4Codec implements FeatureCodec {
*/
@Override
public Class getFeatureType() {
    // VCF4Record has been removed; VariantContext now implements Feature directly
    return VariantContext.class;
}
}

View File

@ -1,45 +0,0 @@
package org.broadinstitute.sting.gatk.refdata.features.vcf4;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
/**
* simple variant context wrapped as VCF4
*/
public class VCF4Record extends VariantContext implements Feature {

    /**
     * Build a VCF4Record, which is simply a VariantContext exposed through
     * Tribble's Feature interface.
     *
     * @param name           the name of the record
     * @param loc            the genomic location of the record
     * @param alleles        the alleles segregating at this site
     * @param genotypes      per-sample genotypes for this record
     * @param negLog10PError -log10 probability that this call is wrong
     * @param filters        the filters applied to this variant
     * @param attributes     any additional key/value annotations
     */
    public VCF4Record(String name, GenomeLoc loc, Collection<Allele> alleles, Map<String, Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
        super(name, loc, alleles, genotypes, negLog10PError, filters, attributes);
    }

    // Tribble Feature accessors: all delegate to this context's GenomeLoc.

    @Override
    public String getChr() {
        final GenomeLoc where = getLocation();
        return where.getContig();
    }

    @Override
    public int getStart() {
        final GenomeLoc where = getLocation();
        return (int) where.getStart();
    }

    @Override
    public int getEnd() {
        final GenomeLoc where = getLocation();
        return (int) where.getStop();
    }
}

View File

@ -0,0 +1,149 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broad.tribble.vcf.*;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.util.AsciiLineReader;
import org.broad.tribble.FeatureCodec;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.commandline.Argument;
import java.util.*;
import java.io.*;
import com.sun.xml.internal.ws.wsdl.parser.ParserUtil;
/**
* A VCF reader performance-testing walker: benchmarks plain line reading, line
* splitting, and full variant/genotype parsing for the VCF3 and VCF4 codecs.
*
* @Author chartl
* @Date Apr 13, 2010
*/
/**
 * Performance-testing walker for VCF readers. Streams a VCF file through the
 * VCF3 or VCF4 codec at a configurable level of parsing effort (nothing, line
 * splitting, variants, or variants + genotypes) so the stages can be
 * benchmarked independently. The map/reduce traversal is a no-op; all of the
 * real work happens in onTraversalDone().
 */
public class VCF4ReaderTestWalker extends RodWalker<VCFRecord,Long> {
    @Argument(shortName="MR", doc="", required=false)
    int maxRecords = -1; // -1 means read the whole file

    @Argument(shortName="vcf", doc="", required=true)
    File vcfFile = null;

    @Argument(shortName="Parse", doc="", required=true)
    ParsingStatus splitFile = ParsingStatus.NONE;

    @Argument(shortName="DontValidate", doc="", required=false)
    boolean DontValidate = false;

    @Argument(shortName="USE_VCF3", doc="", required=false)
    boolean USE_VCF3 = false;

    // how much parsing work to do per line, from nothing to full genotype parsing
    public enum ParsingStatus { NONE, SPLIT_LINES, VARIANTS, GENOTYPES }

    public void initialize() {
    }

    /** No-op: this walker does all its work in onTraversalDone(). */
    public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext context, AlignmentContext alicon) {
        return null;
    }

    public Long reduce(VCFRecord con, Long num) {
        if ( con == null ) {
            return num;
        }
        return 1 + num;
    }

    public Long reduceInit() {
        return 0l;
    }

    // scratch buffer for tab-splitting lines without reallocating per line
    String[] parts = new String[10000];

    /**
     * Reads the whole VCF file after the traversal finishes, applying the
     * requested amount of parsing per line and printing progress every
     * 10000 lines.
     */
    public void onTraversalDone(Long num) {
        VCF4Codec vcf4codec = new VCF4Codec(true);
        VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
        VCF4Codec.validate = ! DontValidate;
        VCFCodec vcf3codec = new VCFCodec();
        FeatureCodec codec = USE_VCF3 ? vcf3codec : vcf4codec;

        try {
            AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(vcfFile));
            try {
                int lineNumber = codec.readHeader(lineReader);
                out.printf("Read %d header lines%n", lineNumber);

                while (true) {
                    String line = lineReader.readLine();
                    if ( line == null )
                        break;
                    lineNumber++;

                    // check the record limit before doing any work on this line
                    if ( maxRecords != -1 && lineNumber >= maxRecords ) {
                        return;
                    }

                    // skip blank lines (charAt(0) would throw) and stray header lines
                    if ( line.length() == 0 || line.charAt(0) == '#' )
                        continue;

                    Object vc = null;
                    if ( splitFile == ParsingStatus.NONE ) {
                        // just measure raw line reading: no parsing at all
                    }
                    else if ( splitFile == ParsingStatus.SPLIT_LINES ) {
                        // todo -- look at header and determine number of elements that need to be parsed. Should be static per file
                        int nParts = ParsingUtils.split(line, parts, '\t');
                    } else {
                        vc = codec.decode(line);
                        if ( USE_VCF3 ) {
                            // the VCF3 codec emits VCFRecords; adapt to VariantContext for comparability
                            VCFRecord rec = (VCFRecord)vc;
                            GenomeLoc loc = GenomeLocParser.createGenomeLoc(rec.getChr(), rec.getStart());
                            ReferenceContext ref = new ReferenceContext(loc, (byte)rec.getReference().charAt(0));
                            vc = VariantContextAdaptors.toVariantContext("X", vc, ref);
                        }
                    }

                    if ( lineNumber % 10000 == 0 ) {
                        // bound the preview length: lines can be shorter than 50 characters
                        System.out.printf("%10d: %s%n", lineNumber, line.subSequence(0, Math.min(50, line.length())));
                        System.out.printf("%10d: %s%n", lineNumber, vc);
                    }
                }
            } finally {
                // don't leak the file handle, even on the early maxRecords return
                lineReader.close();
            }
        } catch ( FileNotFoundException e ) {
            throw new StingException(e.getMessage());
        } catch ( IOException e ) {
            throw new StingException(e.getMessage());
        }
    }
}