incremental changes to the VCF4 codec, including allele clipping down to the minimum reference allele; adding unit testing for certain aspects of the parsing. Not ready for prime-time yet.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3604 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
de9f1f575f
commit
32f324a009
|
|
@ -5,9 +5,7 @@ import org.broad.tribble.FeatureCodec;
|
|||
import org.broad.tribble.exception.CodecLineParsingException;
|
||||
import org.broad.tribble.util.LineReader;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFReaderUtils;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
|
|
@ -15,6 +13,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
|
@ -27,19 +26,39 @@ import java.util.*;
|
|||
public class VCF4Codec implements FeatureCodec {
|
||||
|
||||
// we have to store the list of strings that make up the header until they're needed
|
||||
private List<String> headerStrings = new ArrayList<String>();
|
||||
private VCFHeader header = null;
|
||||
|
||||
public VCF4Codec() {
|
||||
this(true);
|
||||
//throw new StingException("DON'T USE THIS");
|
||||
}
|
||||
// used to subtract from the
|
||||
private static int ZERO_CHAR = (byte)'0';
|
||||
|
||||
// todo -- remove me when done
|
||||
public VCF4Codec(boolean itsOKImTesting) {
|
||||
if ( ! itsOKImTesting )
|
||||
throw new StingException("DON'T USE THIS");
|
||||
}
|
||||
// a mapping of the allele
|
||||
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
|
||||
|
||||
//private static String[] CachedGTKey = new String[100];
|
||||
private static String[] CachedGTValues = new String[100];
|
||||
|
||||
// for performance testing purposes
|
||||
public static boolean parseGenotypesToo = true;
|
||||
|
||||
// for performance testing purposes
|
||||
public static boolean validate = true;
|
||||
|
||||
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
|
||||
private String[] parts = null;
|
||||
|
||||
// for performance we cache the hashmap of filter encodings for quick lookup
|
||||
private HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
|
||||
|
||||
// a set of the genotype keys? TODO: rename to something better
|
||||
private String[] GTKeys = new String[100];
|
||||
|
||||
// a list of the info fields, filter fields, and format fields
|
||||
ArrayList<String> infoFields = new ArrayList<String>();
|
||||
ArrayList<String> formatFields = new ArrayList<String>();
|
||||
ArrayList<String> filterFields = new ArrayList<String>();
|
||||
|
||||
// do we want to validate the info, format, and filter fields
|
||||
private final boolean validateFromHeader = true;
|
||||
|
||||
/**
|
||||
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
|
||||
|
|
@ -48,6 +67,8 @@ public class VCF4Codec implements FeatureCodec {
|
|||
*/
|
||||
@Override
|
||||
public int readHeader(LineReader reader) {
|
||||
List<String> headerStrings = new ArrayList<String>();
|
||||
|
||||
String line = "";
|
||||
try {
|
||||
while ((line = reader.readLine()) != null) {
|
||||
|
|
@ -56,12 +77,25 @@ public class VCF4Codec implements FeatureCodec {
|
|||
}
|
||||
else if (line.startsWith("#")) {
|
||||
headerStrings.add(line);
|
||||
String[] genotypes = line.split("\\s+");
|
||||
Set<String> genotypeSampleNames = new TreeSet<String>();
|
||||
for (int x = 8; x < genotypes.length; x++)
|
||||
genotypeSampleNames.add(genotypes[x]);
|
||||
// this should be the next line -> header = VCFReaderUtils.createHeader(headerStrings);
|
||||
header = new VCFHeader(new HashSet<VCFHeaderLine>(),genotypeSampleNames);
|
||||
header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0);
|
||||
|
||||
// load the parsing fields
|
||||
Set<VCFHeaderLine> headerLines = header.getMetaData();
|
||||
|
||||
// setup our look-up lists for validation
|
||||
for (VCFHeaderLine hl : headerLines) {
|
||||
if (hl.getClass() == VCFFilterHeaderLine.class)
|
||||
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
|
||||
if (hl.getClass() == VCFFormatHeaderLine.class)
|
||||
this.formatFields.add(((VCFFormatHeaderLine)hl).getmName());
|
||||
if (hl.getClass() == VCFInfoHeaderLine.class)
|
||||
this.infoFields.add(((VCFInfoHeaderLine)hl).getmName());
|
||||
}
|
||||
// sort the lists
|
||||
Collections.sort(filterFields);
|
||||
Collections.sort(formatFields);
|
||||
Collections.sort(infoFields);
|
||||
|
||||
return headerStrings.size();
|
||||
}
|
||||
else {
|
||||
|
|
@ -73,9 +107,41 @@ public class VCF4Codec implements FeatureCodec {
|
|||
throw new RuntimeException("IO Exception ", e);
|
||||
}
|
||||
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||
|
||||
}
|
||||
|
||||
private static int ZERO_CHAR = (byte)'0';
|
||||
public Feature decodeLoc(String line) {
|
||||
return decode(line);
|
||||
}
|
||||
|
||||
/**
|
||||
* decode the line into a feature (VariantContext)
|
||||
* @param line the line
|
||||
* @return a VariantContext
|
||||
*/
|
||||
public Feature decode(String line) {
|
||||
if (parts == null)
|
||||
parts = new String[header.getColumnCount()];
|
||||
|
||||
int nParts = ParsingUtils.split(line, parts, '\t');
|
||||
|
||||
// our header cannot be null, we need the genotype sample names and counts
|
||||
if (header == null) throw new IllegalStateException("VCF Header cannot be null");
|
||||
|
||||
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
|
||||
if (nParts != header.getColumnCount())
|
||||
throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);
|
||||
|
||||
|
||||
return parseVCFLine(parts);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a an allele from an index and an array of alleles
|
||||
* @param index the index
|
||||
* @param alleles the alleles
|
||||
* @return an Allele
|
||||
*/
|
||||
private static Allele oneAllele(char index, List<Allele> alleles) {
|
||||
if ( index == '.' )
|
||||
return Allele.NO_CALL;
|
||||
|
|
@ -85,8 +151,14 @@ public class VCF4Codec implements FeatureCodec {
|
|||
}
|
||||
}
|
||||
|
||||
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
|
||||
|
||||
/**
|
||||
* parse genotype alleles from the genotype string
|
||||
* @param GT
|
||||
* @param alleles
|
||||
* @param cache
|
||||
* @return
|
||||
*/
|
||||
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||
// this should cache results [since they are immutable] and return a single object for each genotype
|
||||
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
||||
|
|
@ -99,6 +171,12 @@ public class VCF4Codec implements FeatureCodec {
|
|||
return GTAlleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse out the info fields
|
||||
* @param infoField the fields
|
||||
* @param id the indentifier
|
||||
* @return a mapping of keys to objects
|
||||
*/
|
||||
private Map<String, Object> parseInfo(String infoField, String id) {
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
||||
|
|
@ -120,73 +198,71 @@ public class VCF4Codec implements FeatureCodec {
|
|||
attributes.put(key, value);
|
||||
}
|
||||
}
|
||||
// validate the fields
|
||||
validateFields(attributes.keySet(),infoFields);
|
||||
|
||||
attributes.put("ID", id);
|
||||
return attributes;
|
||||
}
|
||||
|
||||
//private static String[] CachedGTKey = new String[100];
|
||||
private static String[] CachedGTValues = new String[100];
|
||||
|
||||
public static boolean parseGenotypesToo = false; // for performance testing purposes
|
||||
public static boolean validate = true; // for performance testing purposes
|
||||
private static boolean REQUIRE_HEADER = false;
|
||||
|
||||
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
|
||||
private String[] parts = null;
|
||||
|
||||
public Feature decodeLoc(String line) {
|
||||
return decode(line);
|
||||
}
|
||||
|
||||
public Feature decode(String line) {
|
||||
if ( parts == null )
|
||||
parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header
|
||||
|
||||
int nParts = ParsingUtils.split(line, parts, '\t');
|
||||
|
||||
if (REQUIRE_HEADER) { // todo -- remove require header
|
||||
// our header cannot be null, we need the genotype sample names and counts
|
||||
if ( header == null) throw new IllegalStateException("VCF Header cannot be null");
|
||||
|
||||
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
|
||||
if (nParts != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);
|
||||
private void validateFields(Set<String> attributes, List<String> fields) {
|
||||
// validate the info fields
|
||||
if (validateFromHeader) {
|
||||
int count = 0;
|
||||
for (String attr : attributes)
|
||||
if (Collections.binarySearch(fields,attr) < 0)
|
||||
throw new StingException("Unable to find field descibing attribute " + attr);
|
||||
}
|
||||
|
||||
return parseVCFLine(parts, nParts);
|
||||
}
|
||||
|
||||
/**
|
||||
* parse out the qual value
|
||||
* @param qualString the quality string
|
||||
* @return return a double
|
||||
*/
|
||||
private static Double parseQual(String qualString) {
|
||||
// todo -- remove double once we deal with annoying VCFs from 1KG
|
||||
return qualString.equals("-1") || qualString.equals("-1.0") ? VariantContext.NO_NEG_LOG_10PERROR : Double.valueOf(qualString) / 10;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse out the alleles
|
||||
* @param ref the reference base
|
||||
* @param alts a string of alternates to break into alleles
|
||||
* @return a list of alleles, and a pair of the shortest and longest sequence
|
||||
*/
|
||||
private List<Allele> parseAlleles(String ref, String alts) {
|
||||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||
|
||||
// ref
|
||||
checkAllele(ref);
|
||||
Allele refAllele = Allele.create(ref, true);
|
||||
alleles.add(refAllele);
|
||||
|
||||
if ( alts.indexOf(",") == -1 ) { // only 1 alternatives, don't call string split
|
||||
parse1Allele(alleles, alts);
|
||||
} else {
|
||||
for ( String alt : Utils.split(alts, ",") ) {
|
||||
parse1Allele(alleles, alt);
|
||||
}
|
||||
}
|
||||
if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split
|
||||
parseSingleAllele(alleles, alts);
|
||||
else
|
||||
for ( String alt : Utils.split(alts, ",") )
|
||||
parseSingleAllele(alleles, alt);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* check to make sure the allele is an acceptable allele
|
||||
* @param allele the allele to check
|
||||
*/
|
||||
private static void checkAllele(String allele) {
|
||||
if ( ! Allele.acceptableAlleleBases(allele) ) {
|
||||
throw new StingException("Unparsable vcf record with allele " + allele);
|
||||
}
|
||||
}
|
||||
|
||||
private void parse1Allele(List<Allele> alleles, String alt) {
|
||||
/**
|
||||
* parse a single allele, given the allele list
|
||||
* @param alleles the alleles available
|
||||
* @param alt the allele to parse
|
||||
*/
|
||||
private void parseSingleAllele(List<Allele> alleles, String alt) {
|
||||
checkAllele(alt);
|
||||
|
||||
Allele allele = Allele.create(alt, false);
|
||||
|
|
@ -194,25 +270,44 @@ public class VCF4Codec implements FeatureCodec {
|
|||
alleles.add(allele);
|
||||
}
|
||||
|
||||
// todo -- check a static map from filter String to HashSets to reuse objects and avoid parsing
|
||||
/**
|
||||
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
|
||||
* @param filterString the string to parse
|
||||
* @return a set of the filters applied
|
||||
*/
|
||||
private Set<String> parseFilters(String filterString) {
|
||||
if ( filterString.equals(".") )
|
||||
Set<String> fFields;
|
||||
|
||||
// a PASS is simple (no filters)
|
||||
if ( filterString.equals("PASS") ) {
|
||||
return null;
|
||||
}
|
||||
// else do we have the filter string cached?
|
||||
else if (filterHash.containsKey(filterString)) {
|
||||
fFields = filterHash.get(filterString);
|
||||
}
|
||||
// otherwise we have to parse and cache the value
|
||||
else {
|
||||
HashSet<String> s = new HashSet<String>(1);
|
||||
LinkedHashSet<String> s = new LinkedHashSet<String>(1);
|
||||
if ( filterString.indexOf(";") == -1 ) {
|
||||
s.add(filterString);
|
||||
} else {
|
||||
s.addAll(Utils.split(filterString, ";"));
|
||||
}
|
||||
|
||||
return s;
|
||||
filterHash.put(filterString,s);
|
||||
fFields = s;
|
||||
}
|
||||
|
||||
validateFields(fFields,filterFields);
|
||||
return fFields;
|
||||
}
|
||||
|
||||
private String[] GTKeys = new String[100];
|
||||
|
||||
private VariantContext parseVCFLine(String[] parts, int nParts) {
|
||||
/**
|
||||
* parse out the VCF line
|
||||
* @param parts the parts split up
|
||||
* @return a variant context object
|
||||
*/
|
||||
private VariantContext parseVCFLine(String[] parts) {
|
||||
String contig = parts[0];
|
||||
long pos = Long.valueOf(parts[1]);
|
||||
String id = parts[2];
|
||||
|
|
@ -224,20 +319,27 @@ public class VCF4Codec implements FeatureCodec {
|
|||
String GT = parts[8];
|
||||
int genotypesStart = 9;
|
||||
|
||||
List<Allele> alleles = parseAlleles(ref, alts);
|
||||
List<Allele> alleles = parseAlleles(ref, alts);
|
||||
Set<String> filters = parseFilters(filter);
|
||||
Map<String, Object> attributes = parseInfo(info, id);
|
||||
|
||||
// parse genotypes
|
||||
int nGTKeys = ParsingUtils.split(GT, GTKeys, ':');
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(Math.max(nParts - genotypesStart, 1));
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
|
||||
Iterator<String> iter = header.getGenotypeSamples().iterator();
|
||||
|
||||
Pair<GenomeLoc,List<Allele>> locAndAlleles = (ref.length() > 1) ?
|
||||
clipAlleles(contig,pos,ref,alleles) :
|
||||
new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,pos),alleles);
|
||||
|
||||
|
||||
if ( parseGenotypesToo ) {
|
||||
alleleMap.clear();
|
||||
for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) {
|
||||
for ( int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++ ) {
|
||||
String sample = parts[genotypeOffset];
|
||||
String[] GTValues = CachedGTValues;
|
||||
ParsingUtils.split(sample, GTValues, ':');
|
||||
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap);
|
||||
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], locAndAlleles.second, alleleMap);
|
||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||
Set<String> genotypeFilters = null;
|
||||
|
||||
|
|
@ -249,32 +351,69 @@ public class VCF4Codec implements FeatureCodec {
|
|||
if ( GTKeys[i].equals("GQ") ) {
|
||||
GTQual = parseQual(GTValues[i]);
|
||||
} if ( GTKeys[i].equals("FL") ) { // deal with genotype filters here
|
||||
// todo -- get genotype filters working
|
||||
// genotypeFilters = new HashSet<String>();
|
||||
// if ( vcfG.isFiltered() ) // setup the FL genotype filter fields
|
||||
// genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";")));
|
||||
genotypeFilters.addAll(parseFilters(GTValues[i]));
|
||||
} else {
|
||||
gtAttributes.put(GTKeys[i], GTValues[i]);
|
||||
}
|
||||
}
|
||||
// validate the format fields
|
||||
validateFields(gtAttributes.keySet(),formatFields);
|
||||
}
|
||||
|
||||
boolean phased = GTKeys[0].charAt(1) == '|';
|
||||
|
||||
// todo -- actually parse the header to get the sample name
|
||||
Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
||||
Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
||||
genotypes.put(g.getSampleName(), g);
|
||||
}
|
||||
}
|
||||
|
||||
// todo -- doesn't work for indels [the whole reason for VCF4]
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig,pos,pos + ref.length() - 1);
|
||||
|
||||
// todo -- we need access to our track name to name the variant context
|
||||
VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes);
|
||||
return vc;
|
||||
return new VariantContext("foo", locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||
}
|
||||
|
||||
/**
|
||||
* clip the alleles, based on the reference
|
||||
* @param unclippedAlleles the list of alleles
|
||||
* @return a list of alleles, clipped to the reference
|
||||
*/
|
||||
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
||||
List<Allele> newAlleleList = new ArrayList<Allele>();
|
||||
|
||||
// find the preceeding string common to all alleles and the reference
|
||||
int forwardClipped = 0;
|
||||
boolean clipping = true;
|
||||
while (clipping) {
|
||||
for (Allele a : unclippedAlleles)
|
||||
if (forwardClipped > ref.length() - 1)
|
||||
clipping = false;
|
||||
else if (a.length() <= forwardClipped || (a.getBases()[forwardClipped] != ref.getBytes()[forwardClipped])) {
|
||||
clipping = false;
|
||||
}
|
||||
if (clipping) forwardClipped++;
|
||||
}
|
||||
|
||||
int reverseClipped = 0;
|
||||
clipping = true;
|
||||
while (clipping) {
|
||||
for (Allele a : unclippedAlleles)
|
||||
if (a.length() - reverseClipped < 0 || a.length() - forwardClipped == 0)
|
||||
clipping = false;
|
||||
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
|
||||
clipping = false;
|
||||
if (clipping) reverseClipped++;
|
||||
}
|
||||
|
||||
// check to see if we're about to clip all the bases from the reference, if so back off the front clip a base
|
||||
if (forwardClipped + reverseClipped >= ref.length())
|
||||
forwardClipped--;
|
||||
|
||||
for (Allele a : unclippedAlleles)
|
||||
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference()));
|
||||
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipped,(position+ref.length()-reverseClipped-1)),
|
||||
newAlleleList);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @return the type of record
|
||||
|
|
@ -284,8 +423,15 @@ public class VCF4Codec implements FeatureCodec {
|
|||
return VariantContext.class;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the header
|
||||
* @param clazz the class were expecting
|
||||
* @return our VCFHeader
|
||||
* @throws ClassCastException
|
||||
*/
|
||||
@Override
|
||||
public Object getHeader(Class clazz) throws ClassCastException {
|
||||
return null; // TODO: fix this Aaron
|
||||
public VCFHeader getHeader(Class clazz) throws ClassCastException {
|
||||
if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class);
|
||||
return this.header;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ public class VCF4ReaderTestWalker extends RodWalker<VCFRecord,Long> {
|
|||
|
||||
String[] parts = new String[10000];
|
||||
public void onTraversalDone(Long num){
|
||||
VCF4Codec vcf4codec = new VCF4Codec(true);
|
||||
VCF4Codec vcf4codec = new VCF4Codec();
|
||||
VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
|
||||
VCF4Codec.validate = ! DontValidate;
|
||||
|
||||
|
|
|
|||
|
|
@ -2,12 +2,13 @@ package org.broadinstitute.sting;
|
|||
|
||||
import org.apache.log4j.*;
|
||||
import org.apache.log4j.spi.LoggingEvent;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.junit.*;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.*;
|
||||
import java.math.BigInteger;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Enumeration;
|
||||
|
||||
/**
|
||||
|
|
@ -175,4 +176,51 @@ public abstract class BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* a little utility function for all tests to md5sum a file
|
||||
* Shameless taken from:
|
||||
*
|
||||
* http://www.javalobby.org/java/forums/t84420.html
|
||||
*
|
||||
* @param file the file
|
||||
* @return a string
|
||||
*/
|
||||
public static String md5SumFile(File file) {
|
||||
MessageDigest digest;
|
||||
try {
|
||||
digest = MessageDigest.getInstance("MD5");
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new StingException("Unable to find MD5 digest");
|
||||
}
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = new FileInputStream(file);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Unable to open file " + file);
|
||||
}
|
||||
byte[] buffer = new byte[8192];
|
||||
int read = 0;
|
||||
try {
|
||||
while ((read = is.read(buffer)) > 0) {
|
||||
digest.update(buffer, 0, read);
|
||||
}
|
||||
byte[] md5sum = digest.digest();
|
||||
BigInteger bigInt = new BigInteger(1, md5sum);
|
||||
return bigInt.toString(16);
|
||||
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new StingException("Unable to process file for MD5", e);
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
is.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new StingException("Unable to close input stream for MD5 calculation", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,329 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.features.vcf4;
|
||||
|
||||
import org.broad.tribble.util.AsciiLineReader;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* test out pieces of the VCF 4 codec.
|
||||
*/
|
||||
public class VCF4UnitTest extends BaseTest {
|
||||
File vcfFile = new File("testdata/vcfExmaple.vcf4");
|
||||
|
||||
// setup the contig ordering
|
||||
@BeforeClass
|
||||
public static void setupContig() {
|
||||
IndexedFastaSequenceFile seq;
|
||||
try {
|
||||
seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta"));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("unable to load the sequence dictionary");
|
||||
}
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadBasicHeader() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||
|
||||
int seenRecords = 0;
|
||||
|
||||
// check some field entries of each type
|
||||
Set<VCFHeaderLine> lines = testSetup.getHeader().getMetaData();
|
||||
|
||||
for (VCFHeaderLine line : lines) {
|
||||
// check the vcf info header lines
|
||||
if (line instanceof VCFInfoHeaderLine) {
|
||||
VCFInfoHeaderLine ihLIne = (VCFInfoHeaderLine)line;
|
||||
|
||||
// test a normal info line
|
||||
if (ihLIne.getmName().equals("NS")) {
|
||||
Assert.assertEquals(VCFInfoHeaderLine.INFO_TYPE.Integer,ihLIne.getmType());
|
||||
Assert.assertEquals(1,ihLIne.getmCount());
|
||||
Assert.assertTrue("Number of Samples With Data".equals(ihLIne.getmDescription()));
|
||||
seenRecords++;
|
||||
}
|
||||
// test a info line that uses the period to represent an unbounded value
|
||||
if (ihLIne.getmName().equals("AF")) {
|
||||
Assert.assertEquals(VCFInfoHeaderLine.INFO_TYPE.Float,ihLIne.getmType());
|
||||
Assert.assertEquals(VCFInfoHeaderLine.UNBOUNDED,ihLIne.getmCount());
|
||||
Assert.assertTrue("Allele Frequency".equals(ihLIne.getmDescription()));
|
||||
seenRecords++;
|
||||
}
|
||||
}
|
||||
// check the vcf filter header lines
|
||||
if (line instanceof VCFFilterHeaderLine) {
|
||||
VCFFilterHeaderLine fhLIne = (VCFFilterHeaderLine)line;
|
||||
if (fhLIne.getmName().equals("q10")) {
|
||||
Assert.assertTrue("Quality below 10".equals(fhLIne.getmDescription()));
|
||||
seenRecords++;
|
||||
}
|
||||
}
|
||||
|
||||
// check the vcf info header lines
|
||||
if (line instanceof VCFFormatHeaderLine) {
|
||||
VCFFormatHeaderLine ifLIne = (VCFFormatHeaderLine)line;
|
||||
if (ifLIne.getmName().equals("GT")) {
|
||||
Assert.assertEquals(VCFFormatHeaderLine.FORMAT_TYPE.String,ifLIne.getmType());
|
||||
Assert.assertEquals(1,ifLIne.getmCount());
|
||||
Assert.assertTrue("Genotype".equals(ifLIne.getmDescription()));
|
||||
seenRecords++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals("We expected to see three records (one of each type we check), but didn't.",4,seenRecords);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputHeader() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||
|
||||
File tempFile = null;
|
||||
try {
|
||||
tempFile = File.createTempFile("VCF4Test","vcf");
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Couldn't create a temporary file ");
|
||||
}
|
||||
|
||||
// write it to disk
|
||||
VCFWriter writer = new VCFWriter(tempFile);
|
||||
writer.writeHeader(testSetup.getHeader());
|
||||
writer.close();
|
||||
|
||||
// md5 sum the file
|
||||
Assert.assertTrue("expecting md5sum of 637f3c92806d993a9c7b7116da398d6, but got " + md5SumFile(tempFile),"637f3c92806d993a9c7b7116da398d6".equals(md5SumFile(tempFile)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCountVCF4Records() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||
AsciiLineReader reader = testSetup.getReader();
|
||||
VCF4Codec codec = testSetup.getCodec();
|
||||
|
||||
// now parse the lines
|
||||
String line = null;
|
||||
try {
|
||||
line = reader.readLine();
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Failed to read a line");
|
||||
}
|
||||
|
||||
// our record count
|
||||
int recordCount = 0;
|
||||
while (line != null) {
|
||||
try {
|
||||
//System.err.println(codec.decode(line).toString());
|
||||
recordCount++;
|
||||
testSetup.codec.decode(line);
|
||||
line = reader.readLine();
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Failed to read a line");
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(6,recordCount);
|
||||
}
|
||||
|
||||
|
||||
// two constants for testing
|
||||
String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HG=12\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
|
||||
@Test
|
||||
public void testCheckInfoValidation() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||
testSetup.codec.decode(regularLine);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckTwoFewInfoValidation() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||
testSetup.codec.decode(twoFewInfoLine);
|
||||
}
|
||||
|
||||
@Test(expected=StingException.class)
|
||||
public void testCheckTwoManyInfoValidation() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||
testSetup.codec.decode(twoManyInfoLine);
|
||||
}
|
||||
|
||||
//File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
||||
|
||||
/*@Test
|
||||
public void checkLargeVCF() {
|
||||
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
||||
AsciiLineReader reader = testSetup.getReader();
|
||||
VCF4Codec codec = testSetup.getCodec();
|
||||
|
||||
// now parse the lines
|
||||
String line = null;
|
||||
try {
|
||||
line = reader.readLine();
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Failed to read a line");
|
||||
}
|
||||
|
||||
// our record count
|
||||
int recordCount = 0;
|
||||
int badRecordCount = 0;
|
||||
while (line != null) {
|
||||
try {
|
||||
recordCount++;
|
||||
try {
|
||||
testSetup.codec.decode(line);
|
||||
} catch (Exception e) {
|
||||
System.err.println(e.getMessage() + " -> " + line);
|
||||
System.err.println(line);
|
||||
badRecordCount++;
|
||||
}
|
||||
line = reader.readLine();
|
||||
if (recordCount % 1000 == 0)
|
||||
System.err.println("record count == " + recordCount);
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Failed to read a line");
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(0,badRecordCount);
|
||||
Assert.assertEquals(728075,recordCount);
|
||||
}
|
||||
*/
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
@Test
|
||||
public void testClippingOfAllelesDeletionAndInsertion() {
|
||||
String ref = "GGTT";
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create(ref,true));
|
||||
alleles.add(Allele.create("GGAATT",false));
|
||||
alleles.add(Allele.create("GT",false));
|
||||
|
||||
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,3)));
|
||||
|
||||
// we know the ordering
|
||||
//System.err.println(locAndList.second.get(0).toString());
|
||||
//System.err.println(locAndList.second.get(1).toString());
|
||||
//System.err.println(locAndList.second.get(2).toString());
|
||||
Assert.assertTrue(locAndList.second.get(0).toString().equals("GT*"));
|
||||
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||
Assert.assertTrue(locAndList.second.get(1).toString().equals("GAAT"));
|
||||
Assert.assertTrue(locAndList.second.get(2).toString().equals("-"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
@Test
|
||||
public void testClippingOfAllelesLongRefRepeat() {
|
||||
String ref = "GGGG";
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create(ref,true));
|
||||
alleles.add(Allele.create("G",false));
|
||||
alleles.add(Allele.create("GG",false));
|
||||
|
||||
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,4)));
|
||||
|
||||
// we know the ordering
|
||||
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));
|
||||
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||
Assert.assertTrue(locAndList.second.get(1).toString().equals("-"));
|
||||
Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
@Test
|
||||
public void testClippingOfAllelesPlainPolyMorph() {
|
||||
String ref = "C";
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create(ref,true));
|
||||
alleles.add(Allele.create("T",false));
|
||||
alleles.add(Allele.create("G",false));
|
||||
|
||||
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",1,1)));
|
||||
|
||||
// we know the ordering
|
||||
Assert.assertTrue(locAndList.second.get(0).toString().equals("C*"));
|
||||
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||
Assert.assertTrue(locAndList.second.get(1).toString().equals("T"));
|
||||
Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
@Test
|
||||
public void testClippingOfAllelesInsertions() {
|
||||
String ref = "C";
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create(ref,true));
|
||||
alleles.add(Allele.create("CTTTTT",false));
|
||||
alleles.add(Allele.create("GGGGGG",false));
|
||||
|
||||
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",1,1)));
|
||||
|
||||
// we know the ordering
|
||||
Assert.assertTrue(locAndList.second.get(0).toString().equals("C*"));
|
||||
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||
Assert.assertTrue(locAndList.second.get(1).toString().equals("CTTTTT"));
|
||||
Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG"));
|
||||
}
|
||||
|
||||
/**
|
||||
* a test setup for the VCF 4 codec
|
||||
*/
|
||||
private class TestSetup {
|
||||
private AsciiLineReader reader;
|
||||
private VCF4Codec codec;
|
||||
private VCFHeader header;
|
||||
|
||||
public AsciiLineReader getReader() {
|
||||
return reader;
|
||||
}
|
||||
|
||||
public VCF4Codec getCodec() {
|
||||
return codec;
|
||||
}
|
||||
|
||||
public VCFHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
public TestSetup invoke(File vcfFile) {
|
||||
reader = null;
|
||||
try {
|
||||
reader = new AsciiLineReader(new FileInputStream(vcfFile));
|
||||
} catch (FileNotFoundException e) {
|
||||
Assert.fail("Unable to parse out VCF file " + vcfFile);
|
||||
}
|
||||
codec = new VCF4Codec();
|
||||
codec.readHeader(reader);
|
||||
header = codec.getHeader(VCFHeader.class);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue