From 32f324a0091d9a341e8f48ed31cebf053ab7c541 Mon Sep 17 00:00:00 2001 From: aaron Date: Tue, 22 Jun 2010 06:31:05 +0000 Subject: [PATCH] incremental changes to the VCF4 codec, including allele clipping down to the minimum reference allele; adding unit testing for certain aspects of the parsing. Not ready for prime-time yet. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3604 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/features/vcf4/VCF4Codec.java | 312 ++++++++++++----- .../walkers/VCF4ReaderTestWalker.java | 2 +- .../org/broadinstitute/sting/BaseTest.java | 56 ++- .../refdata/features/vcf4/VCF4UnitTest.java | 329 ++++++++++++++++++ 4 files changed, 611 insertions(+), 88 deletions(-) create mode 100644 java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index 432243459..dcddda97a 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -5,9 +5,7 @@ import org.broad.tribble.FeatureCodec; import org.broad.tribble.exception.CodecLineParsingException; import org.broad.tribble.util.LineReader; import org.broad.tribble.util.ParsingUtils; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFReaderUtils; +import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; @@ -15,6 +13,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; import java.io.IOException; import java.util.*; @@ -27,19 +26,39 @@ import java.util.*; public class VCF4Codec implements FeatureCodec { // we have to store the list of strings that make up the header until they're needed - private List headerStrings = new ArrayList(); private VCFHeader header = null; - public VCF4Codec() { - this(true); - //throw new StingException("DON'T USE THIS"); - } + // used to subtract from the + private static int ZERO_CHAR = (byte)'0'; - // todo -- remove me when done - public VCF4Codec(boolean itsOKImTesting) { - if ( ! itsOKImTesting ) - throw new StingException("DON'T USE THIS"); - } + // a mapping of the allele + private static Map> alleleMap = new HashMap>(3); + + //private static String[] CachedGTKey = new String[100]; + private static String[] CachedGTValues = new String[100]; + + // for performance testing purposes + public static boolean parseGenotypesToo = true; + + // for performance testing purposes + public static boolean validate = true; + + // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over + private String[] parts = null; + + // for performance we cache the hashmap of filter encodings for quick lookup + private HashMap> filterHash = new HashMap>(); + + // a set of the genotype keys? TODO: rename to something better + private String[] GTKeys = new String[100]; + + // a list of the info fields, filter fields, and format fields + ArrayList infoFields = new ArrayList(); + ArrayList formatFields = new ArrayList(); + ArrayList filterFields = new ArrayList(); + + // do we want to validate the info, format, and filter fields + private final boolean validateFromHeader = true; /** * this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates @@ -48,6 +67,8 @@ public class VCF4Codec implements FeatureCodec { */ @Override public int readHeader(LineReader reader) { + List headerStrings = new ArrayList(); + String line = ""; try { while ((line = reader.readLine()) != null) { @@ -56,12 +77,25 @@ public class VCF4Codec implements FeatureCodec { } else if (line.startsWith("#")) { headerStrings.add(line); - String[] genotypes = line.split("\\s+"); - Set genotypeSampleNames = new TreeSet(); - for (int x = 8; x < genotypes.length; x++) - genotypeSampleNames.add(genotypes[x]); - // this should be the next line -> header = VCFReaderUtils.createHeader(headerStrings); - header = new VCFHeader(new HashSet(),genotypeSampleNames); + header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0); + + // load the parsing fields + Set headerLines = header.getMetaData(); + + // setup our look-up lists for validation + for (VCFHeaderLine hl : headerLines) { + if (hl.getClass() == VCFFilterHeaderLine.class) + this.filterFields.add(((VCFFilterHeaderLine)hl).getmName()); + if (hl.getClass() == VCFFormatHeaderLine.class) + this.formatFields.add(((VCFFormatHeaderLine)hl).getmName()); + if (hl.getClass() == VCFInfoHeaderLine.class) + this.infoFields.add(((VCFInfoHeaderLine)hl).getmName()); + } + // sort the lists + Collections.sort(filterFields); + Collections.sort(formatFields); + Collections.sort(infoFields); + return headerStrings.size(); } else { @@ -73,9 +107,41 @@ public class VCF4Codec implements FeatureCodec { throw new RuntimeException("IO Exception ", e); } throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); + } - private static int ZERO_CHAR = (byte)'0'; + public Feature decodeLoc(String line) { + return decode(line); + } + + /** + * decode the line into a feature (VariantContext) + * @param line the line + * @return a VariantContext + */ + public Feature decode(String line) { + if (parts == null) + parts = new String[header.getColumnCount()]; + + int nParts = ParsingUtils.split(line, parts, '\t'); + + // our header cannot be null, we need the genotype sample names and counts + if (header == null) throw new IllegalStateException("VCF Header cannot be null"); + + // check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes) + if (nParts != header.getColumnCount()) + throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line); + + + return parseVCFLine(parts); + } + + /** + * create a an allele from an index and an array of alleles + * @param index the index + * @param alleles the alleles + * @return an Allele + */ private static Allele oneAllele(char index, List alleles) { if ( index == '.' ) return Allele.NO_CALL; @@ -85,8 +151,14 @@ public class VCF4Codec implements FeatureCodec { } } - private static Map> alleleMap = new HashMap>(3); + /** + * parse genotype alleles from the genotype string + * @param GT + * @param alleles + * @param cache + * @return + */ private static List parseGenotypeAlleles(String GT, List alleles, Map> cache) { // this should cache results [since they are immutable] and return a single object for each genotype if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0 @@ -99,6 +171,12 @@ public class VCF4Codec implements FeatureCodec { return GTAlleles; } + /** + * parse out the info fields + * @param infoField the fields + * @param id the indentifier + * @return a mapping of keys to objects + */ private Map parseInfo(String infoField, String id) { Map attributes = new HashMap(); @@ -120,73 +198,71 @@ public class VCF4Codec implements FeatureCodec { attributes.put(key, value); } } + // validate the fields + validateFields(attributes.keySet(),infoFields); attributes.put("ID", id); return attributes; } - //private static String[] CachedGTKey = new String[100]; - private static String[] CachedGTValues = new String[100]; - - public static boolean parseGenotypesToo = false; // for performance testing purposes - public static boolean validate = true; // for performance testing purposes - private static boolean REQUIRE_HEADER = false; - - // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over - private String[] parts = null; - - public Feature decodeLoc(String line) { - return decode(line); - } - - public Feature decode(String line) { - if ( parts == null ) - parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header - - int nParts = ParsingUtils.split(line, parts, '\t'); - - if (REQUIRE_HEADER) { // todo -- remove require header - // our header cannot be null, we need the genotype sample names and counts - if ( header == null) throw new IllegalStateException("VCF Header cannot be null"); - - // check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes) - if (nParts != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line); + private void validateFields(Set attributes, List fields) { + // validate the info fields + if (validateFromHeader) { + int count = 0; + for (String attr : attributes) + if (Collections.binarySearch(fields,attr) < 0) + throw new StingException("Unable to find field descibing attribute " + attr); } - - return parseVCFLine(parts, nParts); } + /** + * parse out the qual value + * @param qualString the quality string + * @return return a double + */ private static Double parseQual(String qualString) { // todo -- remove double once we deal with annoying VCFs from 1KG return qualString.equals("-1") || qualString.equals("-1.0") ? VariantContext.NO_NEG_LOG_10PERROR : Double.valueOf(qualString) / 10; } + /** + * parse out the alleles + * @param ref the reference base + * @param alts a string of alternates to break into alleles + * @return a list of alleles, and a pair of the shortest and longest sequence + */ private List parseAlleles(String ref, String alts) { List alleles = new ArrayList(2); // we are almost always biallelic - // ref checkAllele(ref); Allele refAllele = Allele.create(ref, true); alleles.add(refAllele); - if ( alts.indexOf(",") == -1 ) { // only 1 alternatives, don't call string split - parse1Allele(alleles, alts); - } else { - for ( String alt : Utils.split(alts, ",") ) { - parse1Allele(alleles, alt); - } - } + if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split + parseSingleAllele(alleles, alts); + else + for ( String alt : Utils.split(alts, ",") ) + parseSingleAllele(alleles, alt); return alleles; } + /** + * check to make sure the allele is an acceptable allele + * @param allele the allele to check + */ private static void checkAllele(String allele) { if ( ! Allele.acceptableAlleleBases(allele) ) { throw new StingException("Unparsable vcf record with allele " + allele); } } - private void parse1Allele(List alleles, String alt) { + /** + * parse a single allele, given the allele list + * @param alleles the alleles available + * @param alt the allele to parse + */ + private void parseSingleAllele(List alleles, String alt) { checkAllele(alt); Allele allele = Allele.create(alt, false); @@ -194,25 +270,44 @@ public class VCF4Codec implements FeatureCodec { alleles.add(allele); } - // todo -- check a static map from filter String to HashSets to reuse objects and avoid parsing + /** + * parse the filter string, first checking to see if we already have parsed it in a previous attempt + * @param filterString the string to parse + * @return a set of the filters applied + */ private Set parseFilters(String filterString) { - if ( filterString.equals(".") ) + Set fFields; + + // a PASS is simple (no filters) + if ( filterString.equals("PASS") ) { return null; + } + // else do we have the filter string cached? + else if (filterHash.containsKey(filterString)) { + fFields = filterHash.get(filterString); + } + // otherwise we have to parse and cache the value else { - HashSet s = new HashSet(1); + LinkedHashSet s = new LinkedHashSet(1); if ( filterString.indexOf(";") == -1 ) { s.add(filterString); } else { s.addAll(Utils.split(filterString, ";")); } - - return s; + filterHash.put(filterString,s); + fFields = s; } + + validateFields(fFields,filterFields); + return fFields; } - private String[] GTKeys = new String[100]; - - private VariantContext parseVCFLine(String[] parts, int nParts) { + /** + * parse out the VCF line + * @param parts the parts split up + * @return a variant context object + */ + private VariantContext parseVCFLine(String[] parts) { String contig = parts[0]; long pos = Long.valueOf(parts[1]); String id = parts[2]; @@ -224,20 +319,27 @@ public class VCF4Codec implements FeatureCodec { String GT = parts[8]; int genotypesStart = 9; - List alleles = parseAlleles(ref, alts); + List alleles = parseAlleles(ref, alts); Set filters = parseFilters(filter); Map attributes = parseInfo(info, id); // parse genotypes int nGTKeys = ParsingUtils.split(GT, GTKeys, ':'); - Map genotypes = new HashMap(Math.max(nParts - genotypesStart, 1)); + Map genotypes = new HashMap(Math.max(parts.length - genotypesStart, 1)); + Iterator iter = header.getGenotypeSamples().iterator(); + + Pair> locAndAlleles = (ref.length() > 1) ? + clipAlleles(contig,pos,ref,alleles) : + new Pair>(GenomeLocParser.createGenomeLoc(contig,pos),alleles); + + if ( parseGenotypesToo ) { alleleMap.clear(); - for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) { + for ( int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++ ) { String sample = parts[genotypeOffset]; String[] GTValues = CachedGTValues; ParsingUtils.split(sample, GTValues, ':'); - List genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap); + List genotypeAlleles = parseGenotypeAlleles(GTValues[0], locAndAlleles.second, alleleMap); double GTQual = VariantContext.NO_NEG_LOG_10PERROR; Set genotypeFilters = null; @@ -249,32 +351,69 @@ public class VCF4Codec implements FeatureCodec { if ( GTKeys[i].equals("GQ") ) { GTQual = parseQual(GTValues[i]); } if ( GTKeys[i].equals("FL") ) { // deal with genotype filters here - // todo -- get genotype filters working - // genotypeFilters = new HashSet(); -// if ( vcfG.isFiltered() ) // setup the FL genotype filter fields -// genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";"))); + genotypeFilters.addAll(parseFilters(GTValues[i])); } else { gtAttributes.put(GTKeys[i], GTValues[i]); } } + // validate the format fields + validateFields(gtAttributes.keySet(),formatFields); } boolean phased = GTKeys[0].charAt(1) == '|'; - // todo -- actually parse the header to get the sample name - Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased); + Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased); genotypes.put(g.getSampleName(), g); } } - // todo -- doesn't work for indels [the whole reason for VCF4] - GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig,pos,pos + ref.length() - 1); - // todo -- we need access to our track name to name the variant context - VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes); - return vc; + return new VariantContext("foo", locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); } + /** + * clip the alleles, based on the reference + * @param unclippedAlleles the list of alleles + * @return a list of alleles, clipped to the reference + */ + static Pair> clipAlleles(String contig, long position, String ref, List unclippedAlleles) { + List newAlleleList = new ArrayList(); + + // find the preceeding string common to all alleles and the reference + int forwardClipped = 0; + boolean clipping = true; + while (clipping) { + for (Allele a : unclippedAlleles) + if (forwardClipped > ref.length() - 1) + clipping = false; + else if (a.length() <= forwardClipped || (a.getBases()[forwardClipped] != ref.getBytes()[forwardClipped])) { + clipping = false; + } + if (clipping) forwardClipped++; + } + + int reverseClipped = 0; + clipping = true; + while (clipping) { + for (Allele a : unclippedAlleles) + if (a.length() - reverseClipped < 0 || a.length() - forwardClipped == 0) + clipping = false; + else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1]) + clipping = false; + if (clipping) reverseClipped++; + } + + // check to see if we're about to clip all the bases from the reference, if so back off the front clip a base + if (forwardClipped + reverseClipped >= ref.length()) + forwardClipped--; + + for (Allele a : unclippedAlleles) + newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference())); + return new Pair>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipped,(position+ref.length()-reverseClipped-1)), + newAlleleList); + } + + /** * * @return the type of record @@ -284,8 +423,15 @@ public class VCF4Codec implements FeatureCodec { return VariantContext.class; } + /** + * get the header + * @param clazz the class were expecting + * @return our VCFHeader + * @throws ClassCastException + */ @Override - public Object getHeader(Class clazz) throws ClassCastException { - return null; // TODO: fix this Aaron + public VCFHeader getHeader(Class clazz) throws ClassCastException { + if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class); + return this.header; } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java index 23ca27ee5..691e7bd4b 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java @@ -91,7 +91,7 @@ public class VCF4ReaderTestWalker extends RodWalker { String[] parts = new String[10000]; public void onTraversalDone(Long num){ - VCF4Codec vcf4codec = new VCF4Codec(true); + VCF4Codec vcf4codec = new VCF4Codec(); VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES; VCF4Codec.validate = ! DontValidate; diff --git a/java/test/org/broadinstitute/sting/BaseTest.java b/java/test/org/broadinstitute/sting/BaseTest.java index eb9a11568..0aa21ca6c 100755 --- a/java/test/org/broadinstitute/sting/BaseTest.java +++ b/java/test/org/broadinstitute/sting/BaseTest.java @@ -2,12 +2,13 @@ package org.broadinstitute.sting; import org.apache.log4j.*; import org.apache.log4j.spi.LoggingEvent; +import org.broadinstitute.sting.utils.StingException; import org.junit.*; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; +import java.io.*; +import java.math.BigInteger; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import java.util.Enumeration; /** @@ -175,4 +176,51 @@ public abstract class BaseTest { } } + /** + * a little utility function for all tests to md5sum a file + * Shameless taken from: + * + * http://www.javalobby.org/java/forums/t84420.html + * + * @param file the file + * @return a string + */ + public static String md5SumFile(File file) { + MessageDigest digest; + try { + digest = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new StingException("Unable to find MD5 digest"); + } + InputStream is = null; + try { + is = new FileInputStream(file); + } catch (FileNotFoundException e) { + throw new StingException("Unable to open file " + file); + } + byte[] buffer = new byte[8192]; + int read = 0; + try { + while ((read = is.read(buffer)) > 0) { + digest.update(buffer, 0, read); + } + byte[] md5sum = digest.digest(); + BigInteger bigInt = new BigInteger(1, md5sum); + return bigInt.toString(16); + + } + catch (IOException e) { + throw new StingException("Unable to process file for MD5", e); + } + finally { + try { + is.close(); + } + catch (IOException e) { + throw new StingException("Unable to close input stream for MD5 calculation", e); + } + } + } + + } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java new file mode 100644 index 000000000..556b70f2c --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java @@ -0,0 +1,329 @@ +package org.broadinstitute.sting.gatk.refdata.features.vcf4; + +import org.broad.tribble.util.AsciiLineReader; +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +/** + * test out pieces of the VCF 4 codec. + */ +public class VCF4UnitTest extends BaseTest { + File vcfFile = new File("testdata/vcfExmaple.vcf4"); + + // setup the contig ordering + @BeforeClass + public static void setupContig() { + IndexedFastaSequenceFile seq; + try { + seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); + } catch (FileNotFoundException e) { + throw new StingException("unable to load the sequence dictionary"); + } + GenomeLocParser.setupRefContigOrdering(seq); + } + + @Test + public void testReadBasicHeader() { + TestSetup testSetup = new TestSetup().invoke(vcfFile); + + int seenRecords = 0; + + // check some field entries of each type + Set lines = testSetup.getHeader().getMetaData(); + + for (VCFHeaderLine line : lines) { + // check the vcf info header lines + if (line instanceof VCFInfoHeaderLine) { + VCFInfoHeaderLine ihLIne = (VCFInfoHeaderLine)line; + + // test a normal info line + if (ihLIne.getmName().equals("NS")) { + Assert.assertEquals(VCFInfoHeaderLine.INFO_TYPE.Integer,ihLIne.getmType()); + Assert.assertEquals(1,ihLIne.getmCount()); + Assert.assertTrue("Number of Samples With Data".equals(ihLIne.getmDescription())); + seenRecords++; + } + // test a info line that uses the period to represent an unbounded value + if (ihLIne.getmName().equals("AF")) { + Assert.assertEquals(VCFInfoHeaderLine.INFO_TYPE.Float,ihLIne.getmType()); + Assert.assertEquals(VCFInfoHeaderLine.UNBOUNDED,ihLIne.getmCount()); + Assert.assertTrue("Allele Frequency".equals(ihLIne.getmDescription())); + seenRecords++; + } + } + // check the vcf filter header lines + if (line instanceof VCFFilterHeaderLine) { + VCFFilterHeaderLine fhLIne = (VCFFilterHeaderLine)line; + if (fhLIne.getmName().equals("q10")) { + Assert.assertTrue("Quality below 10".equals(fhLIne.getmDescription())); + seenRecords++; + } + } + + // check the vcf info header lines + if (line instanceof VCFFormatHeaderLine) { + VCFFormatHeaderLine ifLIne = (VCFFormatHeaderLine)line; + if (ifLIne.getmName().equals("GT")) { + Assert.assertEquals(VCFFormatHeaderLine.FORMAT_TYPE.String,ifLIne.getmType()); + Assert.assertEquals(1,ifLIne.getmCount()); + Assert.assertTrue("Genotype".equals(ifLIne.getmDescription())); + seenRecords++; + } + } + } + + Assert.assertEquals("We expected to see three records (one of each type we check), but didn't.",4,seenRecords); + } + + @Test + public void testOutputHeader() { + TestSetup testSetup = new TestSetup().invoke(vcfFile); + + File tempFile = null; + try { + tempFile = File.createTempFile("VCF4Test","vcf"); + } catch (IOException e) { + Assert.fail("Couldn't create a temporary file "); + } + + // write it to disk + VCFWriter writer = new VCFWriter(tempFile); + writer.writeHeader(testSetup.getHeader()); + writer.close(); + + // md5 sum the file + Assert.assertTrue("expecting md5sum of 637f3c92806d993a9c7b7116da398d6, but got " + md5SumFile(tempFile),"637f3c92806d993a9c7b7116da398d6".equals(md5SumFile(tempFile))); + } + + @Test + public void testCountVCF4Records() { + TestSetup testSetup = new TestSetup().invoke(vcfFile); + AsciiLineReader reader = testSetup.getReader(); + VCF4Codec codec = testSetup.getCodec(); + + // now parse the lines + String line = null; + try { + line = reader.readLine(); + } catch (IOException e) { + Assert.fail("Failed to read a line"); + } + + // our record count + int recordCount = 0; + while (line != null) { + try { + //System.err.println(codec.decode(line).toString()); + recordCount++; + testSetup.codec.decode(line); + line = reader.readLine(); + } catch (IOException e) { + Assert.fail("Failed to read a line"); + } + } + Assert.assertEquals(6,recordCount); + } + + + // two constants for testing + String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; + String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; + String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HG=12\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; + + @Test + public void testCheckInfoValidation() { + TestSetup testSetup = new TestSetup().invoke(vcfFile); + testSetup.codec.decode(regularLine); + } + + @Test + public void testCheckTwoFewInfoValidation() { + TestSetup testSetup = new TestSetup().invoke(vcfFile); + testSetup.codec.decode(twoFewInfoLine); + } + + @Test(expected=StingException.class) + public void testCheckTwoManyInfoValidation() { + TestSetup testSetup = new TestSetup().invoke(vcfFile); + testSetup.codec.decode(twoManyInfoLine); + } + + //File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf"); + + /*@Test + public void checkLargeVCF() { + TestSetup testSetup = new TestSetup().invoke(largeVCF); + AsciiLineReader reader = testSetup.getReader(); + VCF4Codec codec = testSetup.getCodec(); + + // now parse the lines + String line = null; + try { + line = reader.readLine(); + } catch (IOException e) { + Assert.fail("Failed to read a line"); + } + + // our record count + int recordCount = 0; + int badRecordCount = 0; + while (line != null) { + try { + recordCount++; + try { + testSetup.codec.decode(line); + } catch (Exception e) { + System.err.println(e.getMessage() + " -> " + line); + System.err.println(line); + badRecordCount++; + } + line = reader.readLine(); + if (recordCount % 1000 == 0) + System.err.println("record count == " + recordCount); + } catch (IOException e) { + Assert.fail("Failed to read a line"); + } + } + Assert.assertEquals(0,badRecordCount); + Assert.assertEquals(728075,recordCount); + } +*/ + /** + * test out the clipping of alleles (removing extra context provided by VCF implementation). + */ + @Test + public void testClippingOfAllelesDeletionAndInsertion() { + String ref = "GGTT"; + ArrayList alleles = new ArrayList(); + alleles.add(Allele.create(ref,true)); + alleles.add(Allele.create("GGAATT",false)); + alleles.add(Allele.create("GT",false)); + + Pair> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles); + Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,3))); + + // we know the ordering + //System.err.println(locAndList.second.get(0).toString()); + //System.err.println(locAndList.second.get(1).toString()); + //System.err.println(locAndList.second.get(2).toString()); + Assert.assertTrue(locAndList.second.get(0).toString().equals("GT*")); + Assert.assertTrue(locAndList.second.get(0).isReference()); + Assert.assertTrue(locAndList.second.get(1).toString().equals("GAAT")); + Assert.assertTrue(locAndList.second.get(2).toString().equals("-")); + } + + /** + * test out the clipping of alleles (removing extra context provided by VCF implementation). + */ + @Test + public void testClippingOfAllelesLongRefRepeat() { + String ref = "GGGG"; + ArrayList alleles = new ArrayList(); + alleles.add(Allele.create(ref,true)); + alleles.add(Allele.create("G",false)); + alleles.add(Allele.create("GG",false)); + + Pair> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles); + Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,4))); + + // we know the ordering + Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*")); + Assert.assertTrue(locAndList.second.get(0).isReference()); + Assert.assertTrue(locAndList.second.get(1).toString().equals("-")); + Assert.assertTrue(locAndList.second.get(2).toString().equals("G")); + } + + /** + * test out the clipping of alleles (removing extra context provided by VCF implementation). + */ + @Test + public void testClippingOfAllelesPlainPolyMorph() { + String ref = "C"; + ArrayList alleles = new ArrayList(); + alleles.add(Allele.create(ref,true)); + alleles.add(Allele.create("T",false)); + alleles.add(Allele.create("G",false)); + + Pair> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles); + Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",1,1))); + + // we know the ordering + Assert.assertTrue(locAndList.second.get(0).toString().equals("C*")); + Assert.assertTrue(locAndList.second.get(0).isReference()); + Assert.assertTrue(locAndList.second.get(1).toString().equals("T")); + Assert.assertTrue(locAndList.second.get(2).toString().equals("G")); + } + + /** + * test out the clipping of alleles (removing extra context provided by VCF implementation). + */ + @Test + public void testClippingOfAllelesInsertions() { + String ref = "C"; + ArrayList alleles = new ArrayList(); + alleles.add(Allele.create(ref,true)); + alleles.add(Allele.create("CTTTTT",false)); + alleles.add(Allele.create("GGGGGG",false)); + + Pair> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles); + Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",1,1))); + + // we know the ordering + Assert.assertTrue(locAndList.second.get(0).toString().equals("C*")); + Assert.assertTrue(locAndList.second.get(0).isReference()); + Assert.assertTrue(locAndList.second.get(1).toString().equals("CTTTTT")); + Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG")); + } + + /** + * a test setup for the VCF 4 codec + */ + private class TestSetup { + private AsciiLineReader reader; + private VCF4Codec codec; + private VCFHeader header; + + public AsciiLineReader getReader() { + return reader; + } + + public VCF4Codec getCodec() { + return codec; + } + + public VCFHeader getHeader() { + return header; + } + + public TestSetup invoke(File vcfFile) { + reader = null; + try { + reader = new AsciiLineReader(new FileInputStream(vcfFile)); + } catch (FileNotFoundException e) { + Assert.fail("Unable to parse out VCF file " + vcfFile); + } + codec = new VCF4Codec(); + codec.readHeader(reader); + header = codec.getHeader(VCFHeader.class); + return this; + } + } +}