incremental changes to the VCF4 codec, including allele clipping down to the minimum reference allele; adding unit testing for certain aspects of the parsing. Not ready for prime-time yet.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3604 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-22 06:31:05 +00:00
parent de9f1f575f
commit 32f324a009
4 changed files with 611 additions and 88 deletions

View File

@ -5,9 +5,7 @@ import org.broad.tribble.FeatureCodec;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.util.LineReader;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFReaderUtils;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
@ -15,6 +13,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import java.io.IOException;
import java.util.*;
@ -27,19 +26,39 @@ import java.util.*;
public class VCF4Codec implements FeatureCodec {
// we have to store the list of strings that make up the header until they're needed
private List<String> headerStrings = new ArrayList<String>();
private VCFHeader header = null;
public VCF4Codec() {
this(true);
//throw new StingException("DON'T USE THIS");
}
// used to subtract from the
private static int ZERO_CHAR = (byte)'0';
// todo -- remove me when done
public VCF4Codec(boolean itsOKImTesting) {
if ( ! itsOKImTesting )
throw new StingException("DON'T USE THIS");
}
// a mapping of the allele
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
//private static String[] CachedGTKey = new String[100];
private static String[] CachedGTValues = new String[100];
// for performance testing purposes
public static boolean parseGenotypesToo = true;
// for performance testing purposes
public static boolean validate = true;
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
private String[] parts = null;
// for performance we cache the hashmap of filter encodings for quick lookup
private HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
// a set of the genotype keys? TODO: rename to something better
private String[] GTKeys = new String[100];
// a list of the info fields, filter fields, and format fields
ArrayList<String> infoFields = new ArrayList<String>();
ArrayList<String> formatFields = new ArrayList<String>();
ArrayList<String> filterFields = new ArrayList<String>();
// do we want to validate the info, format, and filter fields
private final boolean validateFromHeader = true;
/**
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
@ -48,6 +67,8 @@ public class VCF4Codec implements FeatureCodec {
*/
@Override
public int readHeader(LineReader reader) {
List<String> headerStrings = new ArrayList<String>();
String line = "";
try {
while ((line = reader.readLine()) != null) {
@ -56,12 +77,25 @@ public class VCF4Codec implements FeatureCodec {
}
else if (line.startsWith("#")) {
headerStrings.add(line);
String[] genotypes = line.split("\\s+");
Set<String> genotypeSampleNames = new TreeSet<String>();
for (int x = 8; x < genotypes.length; x++)
genotypeSampleNames.add(genotypes[x]);
// this should be the next line -> header = VCFReaderUtils.createHeader(headerStrings);
header = new VCFHeader(new HashSet<VCFHeaderLine>(),genotypeSampleNames);
header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0);
// load the parsing fields
Set<VCFHeaderLine> headerLines = header.getMetaData();
// setup our look-up lists for validation
for (VCFHeaderLine hl : headerLines) {
if (hl.getClass() == VCFFilterHeaderLine.class)
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
if (hl.getClass() == VCFFormatHeaderLine.class)
this.formatFields.add(((VCFFormatHeaderLine)hl).getmName());
if (hl.getClass() == VCFInfoHeaderLine.class)
this.infoFields.add(((VCFInfoHeaderLine)hl).getmName());
}
// sort the lists
Collections.sort(filterFields);
Collections.sort(formatFields);
Collections.sort(infoFields);
return headerStrings.size();
}
else {
@ -73,9 +107,41 @@ public class VCF4Codec implements FeatureCodec {
throw new RuntimeException("IO Exception ", e);
}
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
}
private static int ZERO_CHAR = (byte)'0';
public Feature decodeLoc(String line) {
return decode(line);
}
/**
* decode the line into a feature (VariantContext)
* @param line the line
* @return a VariantContext
*/
public Feature decode(String line) {
if (parts == null)
parts = new String[header.getColumnCount()];
int nParts = ParsingUtils.split(line, parts, '\t');
// our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new IllegalStateException("VCF Header cannot be null");
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
if (nParts != header.getColumnCount())
throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);
return parseVCFLine(parts);
}
/**
* create a an allele from an index and an array of alleles
* @param index the index
* @param alleles the alleles
* @return an Allele
*/
private static Allele oneAllele(char index, List<Allele> alleles) {
if ( index == '.' )
return Allele.NO_CALL;
@ -85,8 +151,14 @@ public class VCF4Codec implements FeatureCodec {
}
}
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
/**
* parse genotype alleles from the genotype string
* @param GT
* @param alleles
* @param cache
* @return
*/
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
// this should cache results [since they are immutable] and return a single object for each genotype
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
@ -99,6 +171,12 @@ public class VCF4Codec implements FeatureCodec {
return GTAlleles;
}
/**
* parse out the info fields
* @param infoField the fields
* @param id the indentifier
* @return a mapping of keys to objects
*/
private Map<String, Object> parseInfo(String infoField, String id) {
Map<String, Object> attributes = new HashMap<String, Object>();
@ -120,73 +198,71 @@ public class VCF4Codec implements FeatureCodec {
attributes.put(key, value);
}
}
// validate the fields
validateFields(attributes.keySet(),infoFields);
attributes.put("ID", id);
return attributes;
}
//private static String[] CachedGTKey = new String[100];
private static String[] CachedGTValues = new String[100];
public static boolean parseGenotypesToo = false; // for performance testing purposes
public static boolean validate = true; // for performance testing purposes
private static boolean REQUIRE_HEADER = false;
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
private String[] parts = null;
public Feature decodeLoc(String line) {
return decode(line);
}
public Feature decode(String line) {
if ( parts == null )
parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header
int nParts = ParsingUtils.split(line, parts, '\t');
if (REQUIRE_HEADER) { // todo -- remove require header
// our header cannot be null, we need the genotype sample names and counts
if ( header == null) throw new IllegalStateException("VCF Header cannot be null");
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
if (nParts != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);
private void validateFields(Set<String> attributes, List<String> fields) {
// validate the info fields
if (validateFromHeader) {
int count = 0;
for (String attr : attributes)
if (Collections.binarySearch(fields,attr) < 0)
throw new StingException("Unable to find field descibing attribute " + attr);
}
return parseVCFLine(parts, nParts);
}
/**
 * Parse the QUAL column into a confidence value.
 *
 * @param qualString the raw quality string from the VCF line
 * @return the quality divided by ten, or the missing-value sentinel when the
 *         string is "-1" or "-1.0"
 */
private static Double parseQual(String qualString) {
    // todo -- remove double once we deal with annoying VCFs from 1KG
    boolean missing = qualString.equals("-1") || qualString.equals("-1.0");
    if (missing) {
        return VariantContext.NO_NEG_LOG_10PERROR;
    }
    return Double.valueOf(qualString) / 10;
}
/**
* parse out the alleles
* @param ref the reference base
* @param alts a string of alternates to break into alleles
* @return a list of alleles, and a pair of the shortest and longest sequence
*/
private List<Allele> parseAlleles(String ref, String alts) {
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
// ref
checkAllele(ref);
Allele refAllele = Allele.create(ref, true);
alleles.add(refAllele);
if ( alts.indexOf(",") == -1 ) { // only 1 alternatives, don't call string split
parse1Allele(alleles, alts);
} else {
for ( String alt : Utils.split(alts, ",") ) {
parse1Allele(alleles, alt);
}
}
if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split
parseSingleAllele(alleles, alts);
else
for ( String alt : Utils.split(alts, ",") )
parseSingleAllele(alleles, alt);
return alleles;
}
/**
 * Validate that an allele string is composed of acceptable bases.
 *
 * @param allele the allele string to validate
 * @throws StingException if the allele contains unacceptable bases
 */
private static void checkAllele(String allele) {
    boolean acceptable = Allele.acceptableAlleleBases(allele);
    if (!acceptable)
        throw new StingException("Unparsable vcf record with allele " + allele);
}
private void parse1Allele(List<Allele> alleles, String alt) {
/**
* parse a single allele, given the allele list
* @param alleles the alleles available
* @param alt the allele to parse
*/
private void parseSingleAllele(List<Allele> alleles, String alt) {
checkAllele(alt);
Allele allele = Allele.create(alt, false);
@ -194,25 +270,44 @@ public class VCF4Codec implements FeatureCodec {
alleles.add(allele);
}
// todo -- check a static map from filter String to HashSets to reuse objects and avoid parsing
/**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
* @param filterString the string to parse
* @return a set of the filters applied
*/
private Set<String> parseFilters(String filterString) {
if ( filterString.equals(".") )
Set<String> fFields;
// a PASS is simple (no filters)
if ( filterString.equals("PASS") ) {
return null;
}
// else do we have the filter string cached?
else if (filterHash.containsKey(filterString)) {
fFields = filterHash.get(filterString);
}
// otherwise we have to parse and cache the value
else {
HashSet<String> s = new HashSet<String>(1);
LinkedHashSet<String> s = new LinkedHashSet<String>(1);
if ( filterString.indexOf(";") == -1 ) {
s.add(filterString);
} else {
s.addAll(Utils.split(filterString, ";"));
}
return s;
filterHash.put(filterString,s);
fFields = s;
}
validateFields(fFields,filterFields);
return fFields;
}
private String[] GTKeys = new String[100];
private VariantContext parseVCFLine(String[] parts, int nParts) {
/**
* parse out the VCF line
* @param parts the parts split up
* @return a variant context object
*/
private VariantContext parseVCFLine(String[] parts) {
String contig = parts[0];
long pos = Long.valueOf(parts[1]);
String id = parts[2];
@ -224,20 +319,27 @@ public class VCF4Codec implements FeatureCodec {
String GT = parts[8];
int genotypesStart = 9;
List<Allele> alleles = parseAlleles(ref, alts);
List<Allele> alleles = parseAlleles(ref, alts);
Set<String> filters = parseFilters(filter);
Map<String, Object> attributes = parseInfo(info, id);
// parse genotypes
int nGTKeys = ParsingUtils.split(GT, GTKeys, ':');
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(Math.max(nParts - genotypesStart, 1));
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
Iterator<String> iter = header.getGenotypeSamples().iterator();
Pair<GenomeLoc,List<Allele>> locAndAlleles = (ref.length() > 1) ?
clipAlleles(contig,pos,ref,alleles) :
new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,pos),alleles);
if ( parseGenotypesToo ) {
alleleMap.clear();
for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) {
for ( int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++ ) {
String sample = parts[genotypeOffset];
String[] GTValues = CachedGTValues;
ParsingUtils.split(sample, GTValues, ':');
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap);
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], locAndAlleles.second, alleleMap);
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
Set<String> genotypeFilters = null;
@ -249,32 +351,69 @@ public class VCF4Codec implements FeatureCodec {
if ( GTKeys[i].equals("GQ") ) {
GTQual = parseQual(GTValues[i]);
} if ( GTKeys[i].equals("FL") ) { // deal with genotype filters here
// todo -- get genotype filters working
// genotypeFilters = new HashSet<String>();
// if ( vcfG.isFiltered() ) // setup the FL genotype filter fields
// genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";")));
genotypeFilters.addAll(parseFilters(GTValues[i]));
} else {
gtAttributes.put(GTKeys[i], GTValues[i]);
}
}
// validate the format fields
validateFields(gtAttributes.keySet(),formatFields);
}
boolean phased = GTKeys[0].charAt(1) == '|';
// todo -- actually parse the header to get the sample name
Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
genotypes.put(g.getSampleName(), g);
}
}
// todo -- doesn't work for indels [the whole reason for VCF4]
GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig,pos,pos + ref.length() - 1);
// todo -- we need access to our track name to name the variant context
VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes);
return vc;
return new VariantContext("foo", locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
}
/**
 * clip the alleles down to the minimal representation, removing reference
 * context shared by every allele (the extra bases VCF 4 requires around indels)
 * @param contig the contig name for the resulting location
 * @param position the 1-based position of the first unclipped reference base
 * @param ref the reference allele string
 * @param unclippedAlleles the list of alleles, reference allele included
 * @return a pair of the clipped genome location and the clipped allele list
 */
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
List<Allele> newAlleleList = new ArrayList<Allele>();
// find the preceding string common to all alleles and the reference
// clipping stays true only while EVERY allele matches the reference at the
// current offset; the inner loop never resets it to true
int forwardClipped = 0;
boolean clipping = true;
while (clipping) {
for (Allele a : unclippedAlleles)
if (forwardClipped > ref.length() - 1)
clipping = false;
else if (a.length() <= forwardClipped || (a.getBases()[forwardClipped] != ref.getBytes()[forwardClipped]))  {
clipping = false;
}
if (clipping) forwardClipped++;
}
// now clip from the back, stopping before any allele would be consumed entirely
// NOTE(review): the first guard reads `a.length() - reverseClipped < 0`; it
// looks like it should be `<= 0` to stop one step earlier — the `- forwardClipped == 0`
// clause appears to cover the observed cases, but confirm against the unit tests
int reverseClipped = 0;
clipping = true;
while (clipping) {
for (Allele a : unclippedAlleles)
if (a.length() - reverseClipped < 0 || a.length() - forwardClipped == 0)
clipping = false;
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
clipping = false;
if (clipping) reverseClipped++;
}
// check to see if we're about to clip all the bases from the reference, if so back off the front clip a base
if (forwardClipped + reverseClipped >= ref.length())
forwardClipped--;
// rebuild each allele from its clipped base range, preserving reference status
for (Allele a : unclippedAlleles)
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference()));
// the location spans only the reference bases that survived clipping
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipped,(position+ref.length()-reverseClipped-1)),
newAlleleList);
}
/**
*
* @return the type of record
@ -284,8 +423,15 @@ public class VCF4Codec implements FeatureCodec {
return VariantContext.class;
}
/**
* get the header
* @param clazz the class were expecting
* @return our VCFHeader
* @throws ClassCastException
*/
@Override
public Object getHeader(Class clazz) throws ClassCastException {
return null; // TODO: fix this Aaron
public VCFHeader getHeader(Class clazz) throws ClassCastException {
if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class);
return this.header;
}
}

View File

@ -91,7 +91,7 @@ public class VCF4ReaderTestWalker extends RodWalker<VCFRecord,Long> {
String[] parts = new String[10000];
public void onTraversalDone(Long num){
VCF4Codec vcf4codec = new VCF4Codec(true);
VCF4Codec vcf4codec = new VCF4Codec();
VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
VCF4Codec.validate = ! DontValidate;

View File

@ -2,12 +2,13 @@ package org.broadinstitute.sting;
import org.apache.log4j.*;
import org.apache.log4j.spi.LoggingEvent;
import org.broadinstitute.sting.utils.StingException;
import org.junit.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.*;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Enumeration;
/**
@ -175,4 +176,51 @@ public abstract class BaseTest {
}
}
/**
 * a little utility function for all tests to md5sum a file
 * Shameless taken from:
 *
 * http://www.javalobby.org/java/forums/t84420.html
 *
 * @param file the file to checksum
 * @return the MD5 digest of the file, rendered as an unpadded hex string
 * @throws StingException if MD5 is unavailable, or the file cannot be opened, read, or closed
 */
public static String md5SumFile(File file) {
    MessageDigest digest;
    try {
        digest = MessageDigest.getInstance("MD5");
    } catch (NoSuchAlgorithmException e) {
        throw new StingException("Unable to find MD5 digest");
    }
    InputStream is;
    try {
        is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
        throw new StingException("Unable to open file " + file);
    }
    byte[] buffer = new byte[8192];
    int read;
    try {
        // read until true end-of-stream; the previous "> 0" test would stop
        // early on a legal zero-byte read (InputStream.read only signals EOF with -1)
        while ((read = is.read(buffer)) != -1) {
            digest.update(buffer, 0, read);
        }
        byte[] md5sum = digest.digest();
        BigInteger bigInt = new BigInteger(1, md5sum);
        // NOTE(review): BigInteger.toString(16) drops leading zeros, so digests
        // starting with a 0 nibble come back shorter than 32 chars; existing
        // callers compare against these unpadded strings, so the quirk is
        // preserved deliberately — do not pad without updating the expected sums
        return bigInt.toString(16);
    }
    catch (IOException e) {
        throw new StingException("Unable to process file for MD5", e);
    }
    finally {
        try {
            is.close();
        }
        catch (IOException e) {
            throw new StingException("Unable to close input stream for MD5 calculation", e);
        }
    }
}
}

View File

@ -0,0 +1,329 @@
package org.broadinstitute.sting.gatk.refdata.features.vcf4;
import org.broad.tribble.util.AsciiLineReader;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
 * Unit tests for pieces of the VCF 4 codec: header parsing, record decoding,
 * info-field validation, and clipping of alleles down to the minimal
 * reference representation.
 */
public class VCF4UnitTest extends BaseTest {
    // the VCF 4.0 file all of these tests parse
    File vcfFile = new File("testdata/vcfExmaple.vcf4");

    /** setup the contig ordering from the b36 reference before any test runs */
    @BeforeClass
    public static void setupContig() {
        IndexedFastaSequenceFile seq;
        try {
            seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta"));
        } catch (FileNotFoundException e) {
            throw new StingException("unable to load the sequence dictionary");
        }
        GenomeLocParser.setupRefContigOrdering(seq);
    }

    /**
     * parse the test file's header and spot-check one meta-data line of each
     * type we care about: two INFO lines (NS, AF), one FILTER line (q10),
     * and one FORMAT line (GT) — four checks in total.
     */
    @Test
    public void testReadBasicHeader() {
        TestSetup testSetup = new TestSetup().invoke(vcfFile);
        int seenRecords = 0;

        // check some field entries of each type
        Set<VCFHeaderLine> lines = testSetup.getHeader().getMetaData();
        for (VCFHeaderLine line : lines) {
            // check the vcf info header lines
            if (line instanceof VCFInfoHeaderLine) {
                VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;

                // test a normal info line
                if (infoLine.getmName().equals("NS")) {
                    Assert.assertEquals(VCFInfoHeaderLine.INFO_TYPE.Integer,infoLine.getmType());
                    Assert.assertEquals(1,infoLine.getmCount());
                    Assert.assertTrue("Number of Samples With Data".equals(infoLine.getmDescription()));
                    seenRecords++;
                }
                // test an info line that uses the period to represent an unbounded value
                if (infoLine.getmName().equals("AF")) {
                    Assert.assertEquals(VCFInfoHeaderLine.INFO_TYPE.Float,infoLine.getmType());
                    Assert.assertEquals(VCFInfoHeaderLine.UNBOUNDED,infoLine.getmCount());
                    Assert.assertTrue("Allele Frequency".equals(infoLine.getmDescription()));
                    seenRecords++;
                }
            }
            // check the vcf filter header lines
            if (line instanceof VCFFilterHeaderLine) {
                VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line;
                if (filterLine.getmName().equals("q10")) {
                    Assert.assertTrue("Quality below 10".equals(filterLine.getmDescription()));
                    seenRecords++;
                }
            }
            // check the vcf format header lines
            if (line instanceof VCFFormatHeaderLine) {
                VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
                if (formatLine.getmName().equals("GT")) {
                    Assert.assertEquals(VCFFormatHeaderLine.FORMAT_TYPE.String,formatLine.getmType());
                    Assert.assertEquals(1,formatLine.getmCount());
                    Assert.assertTrue("Genotype".equals(formatLine.getmDescription()));
                    seenRecords++;
                }
            }
        }
        // the previous message said "three records" while asserting four; all four checks above must fire
        Assert.assertEquals("We expected to see four records (NS, AF, q10, and GT), but didn't.",4,seenRecords);
    }

    /** round-trip the parsed header through VCFWriter and md5sum the result */
    @Test
    public void testOutputHeader() {
        TestSetup testSetup = new TestSetup().invoke(vcfFile);

        File tempFile = null;
        try {
            tempFile = File.createTempFile("VCF4Test","vcf");
        } catch (IOException e) {
            Assert.fail("Couldn't create a temporary file ");
        }
        // clean up the temporary file when the JVM exits
        tempFile.deleteOnExit();

        // write it to disk
        VCFWriter writer = new VCFWriter(tempFile);
        writer.writeHeader(testSetup.getHeader());
        writer.close();

        // md5 sum the file (note: md5SumFile returns an unpadded hex string, hence 31 chars here)
        Assert.assertTrue("expecting md5sum of 637f3c92806d993a9c7b7116da398d6, but got " + md5SumFile(tempFile),"637f3c92806d993a9c7b7116da398d6".equals(md5SumFile(tempFile)));
    }

    /** decode every record in the test file and check the count */
    @Test
    public void testCountVCF4Records() {
        TestSetup testSetup = new TestSetup().invoke(vcfFile);
        AsciiLineReader reader = testSetup.getReader();
        VCF4Codec codec = testSetup.getCodec();

        // now parse the lines
        String line = null;
        try {
            line = reader.readLine();
        } catch (IOException e) {
            Assert.fail("Failed to read a line");
        }

        // our record count
        int recordCount = 0;
        while (line != null) {
            try {
                recordCount++;
                codec.decode(line);
                line = reader.readLine();
            } catch (IOException e) {
                Assert.fail("Failed to read a line");
            }
        }
        Assert.assertEquals(6,recordCount);
    }

    // three test lines: a well-formed record, one omitting a declared INFO field (DP),
    // and one carrying an INFO field (HG) the header doesn't declare
    String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
    String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
    String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HG=12\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";

    /** a record whose INFO fields all appear in the header should decode cleanly */
    @Test
    public void testCheckInfoValidation() {
        TestSetup testSetup = new TestSetup().invoke(vcfFile);
        testSetup.codec.decode(regularLine);
    }

    /** omitting a declared INFO field is legal and should not throw */
    @Test
    public void testCheckTwoFewInfoValidation() {
        TestSetup testSetup = new TestSetup().invoke(vcfFile);
        testSetup.codec.decode(twoFewInfoLine);
    }

    /** an INFO field absent from the header should fail validation */
    @Test(expected=StingException.class)
    public void testCheckTwoManyInfoValidation() {
        TestSetup testSetup = new TestSetup().invoke(vcfFile);
        testSetup.codec.decode(twoManyInfoLine);
    }

    // disabled whole-file stress test against a large 1000 Genomes indel VCF;
    // kept for reference since the path is machine-specific
    //File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
    /*@Test
    public void checkLargeVCF() {
        TestSetup testSetup = new TestSetup().invoke(largeVCF);
        AsciiLineReader reader = testSetup.getReader();
        VCF4Codec codec = testSetup.getCodec();

        // now parse the lines
        String line = null;
        try {
            line = reader.readLine();
        } catch (IOException e) {
            Assert.fail("Failed to read a line");
        }

        // our record count
        int recordCount = 0;
        int badRecordCount = 0;
        while (line != null) {
            try {
                recordCount++;
                try {
                    testSetup.codec.decode(line);
                } catch (Exception e) {
                    System.err.println(e.getMessage() + " -> " + line);
                    System.err.println(line);
                    badRecordCount++;
                }
                line = reader.readLine();
                if (recordCount % 1000 == 0)
                    System.err.println("record count == " + recordCount);
            } catch (IOException e) {
                Assert.fail("Failed to read a line");
            }
        }
        Assert.assertEquals(0,badRecordCount);
        Assert.assertEquals(728075,recordCount);
    }
    */

    /**
     * test out the clipping of alleles (removing extra context provided by VCF implementation):
     * a deletion and an insertion sharing flanking reference context.
     */
    @Test
    public void testClippingOfAllelesDeletionAndInsertion() {
        String ref = "GGTT";
        ArrayList<Allele> alleles = new ArrayList<Allele>();
        alleles.add(Allele.create(ref,true));
        alleles.add(Allele.create("GGAATT",false));
        alleles.add(Allele.create("GT",false));
        Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
        Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,3)));
        // we know the ordering: ref first, then the alternates in input order
        Assert.assertTrue(locAndList.second.get(0).toString().equals("GT*"));
        Assert.assertTrue(locAndList.second.get(0).isReference());
        Assert.assertTrue(locAndList.second.get(1).toString().equals("GAAT"));
        Assert.assertTrue(locAndList.second.get(2).toString().equals("-"));
    }

    /**
     * test out the clipping of alleles (removing extra context provided by VCF implementation):
     * a homopolymer run where the deletion alleles are prefixes of the reference.
     */
    @Test
    public void testClippingOfAllelesLongRefRepeat() {
        String ref = "GGGG";
        ArrayList<Allele> alleles = new ArrayList<Allele>();
        alleles.add(Allele.create(ref,true));
        alleles.add(Allele.create("G",false));
        alleles.add(Allele.create("GG",false));
        Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
        Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,4)));
        // we know the ordering: ref first, then the alternates in input order
        Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));
        Assert.assertTrue(locAndList.second.get(0).isReference());
        Assert.assertTrue(locAndList.second.get(1).toString().equals("-"));
        Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
    }

    /**
     * test out the clipping of alleles (removing extra context provided by VCF implementation):
     * a plain SNP should pass through unclipped.
     */
    @Test
    public void testClippingOfAllelesPlainPolyMorph() {
        String ref = "C";
        ArrayList<Allele> alleles = new ArrayList<Allele>();
        alleles.add(Allele.create(ref,true));
        alleles.add(Allele.create("T",false));
        alleles.add(Allele.create("G",false));
        Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
        Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",1,1)));
        // we know the ordering: ref first, then the alternates in input order
        Assert.assertTrue(locAndList.second.get(0).toString().equals("C*"));
        Assert.assertTrue(locAndList.second.get(0).isReference());
        Assert.assertTrue(locAndList.second.get(1).toString().equals("T"));
        Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
    }

    /**
     * test out the clipping of alleles (removing extra context provided by VCF implementation):
     * insertions with no common context should pass through unclipped.
     */
    @Test
    public void testClippingOfAllelesInsertions() {
        String ref = "C";
        ArrayList<Allele> alleles = new ArrayList<Allele>();
        alleles.add(Allele.create(ref,true));
        alleles.add(Allele.create("CTTTTT",false));
        alleles.add(Allele.create("GGGGGG",false));
        Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
        Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",1,1)));
        // we know the ordering: ref first, then the alternates in input order
        Assert.assertTrue(locAndList.second.get(0).toString().equals("C*"));
        Assert.assertTrue(locAndList.second.get(0).isReference());
        Assert.assertTrue(locAndList.second.get(1).toString().equals("CTTTTT"));
        Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG"));
    }

    /**
     * a test setup for the VCF 4 codec: opens the file, reads the header,
     * and exposes the reader, the codec, and the parsed header.
     */
    private class TestSetup {
        private AsciiLineReader reader;
        private VCF4Codec codec;
        private VCFHeader header;

        public AsciiLineReader getReader() {
            return reader;
        }

        public VCF4Codec getCodec() {
            return codec;
        }

        public VCFHeader getHeader() {
            return header;
        }

        /** open the given VCF file and consume its header, leaving the reader at the first record */
        public TestSetup invoke(File vcfFile) {
            reader = null;
            try {
                reader = new AsciiLineReader(new FileInputStream(vcfFile));
            } catch (FileNotFoundException e) {
                Assert.fail("Unable to parse out VCF file " + vcfFile);
            }
            codec = new VCF4Codec();
            codec.readHeader(reader);
            header = codec.getHeader(VCFHeader.class);
            return this;
        }
    }
}