incremental changes to the VCF4 codec, including allele clipping down to the minimum reference allele; adding unit testing for certain aspects of the parsing. Not ready for prime-time yet.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3604 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
de9f1f575f
commit
32f324a009
|
|
@ -5,9 +5,7 @@ import org.broad.tribble.FeatureCodec;
|
||||||
import org.broad.tribble.exception.CodecLineParsingException;
|
import org.broad.tribble.exception.CodecLineParsingException;
|
||||||
import org.broad.tribble.util.LineReader;
|
import org.broad.tribble.util.LineReader;
|
||||||
import org.broad.tribble.util.ParsingUtils;
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
import org.broad.tribble.vcf.VCFHeader;
|
import org.broad.tribble.vcf.*;
|
||||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
|
||||||
import org.broad.tribble.vcf.VCFReaderUtils;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
|
|
@ -15,6 +13,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -27,19 +26,39 @@ import java.util.*;
|
||||||
public class VCF4Codec implements FeatureCodec {
|
public class VCF4Codec implements FeatureCodec {
|
||||||
|
|
||||||
// we have to store the list of strings that make up the header until they're needed
|
// we have to store the list of strings that make up the header until they're needed
|
||||||
private List<String> headerStrings = new ArrayList<String>();
|
|
||||||
private VCFHeader header = null;
|
private VCFHeader header = null;
|
||||||
|
|
||||||
public VCF4Codec() {
|
// used to subtract from the
|
||||||
this(true);
|
private static int ZERO_CHAR = (byte)'0';
|
||||||
//throw new StingException("DON'T USE THIS");
|
|
||||||
}
|
|
||||||
|
|
||||||
// todo -- remove me when done
|
// a mapping of the allele
|
||||||
public VCF4Codec(boolean itsOKImTesting) {
|
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
|
||||||
if ( ! itsOKImTesting )
|
|
||||||
throw new StingException("DON'T USE THIS");
|
//private static String[] CachedGTKey = new String[100];
|
||||||
}
|
private static String[] CachedGTValues = new String[100];
|
||||||
|
|
||||||
|
// for performance testing purposes
|
||||||
|
public static boolean parseGenotypesToo = true;
|
||||||
|
|
||||||
|
// for performance testing purposes
|
||||||
|
public static boolean validate = true;
|
||||||
|
|
||||||
|
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
|
||||||
|
private String[] parts = null;
|
||||||
|
|
||||||
|
// for performance we cache the hashmap of filter encodings for quick lookup
|
||||||
|
private HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
|
||||||
|
|
||||||
|
// a set of the genotype keys? TODO: rename to something better
|
||||||
|
private String[] GTKeys = new String[100];
|
||||||
|
|
||||||
|
// a list of the info fields, filter fields, and format fields
|
||||||
|
ArrayList<String> infoFields = new ArrayList<String>();
|
||||||
|
ArrayList<String> formatFields = new ArrayList<String>();
|
||||||
|
ArrayList<String> filterFields = new ArrayList<String>();
|
||||||
|
|
||||||
|
// do we want to validate the info, format, and filter fields
|
||||||
|
private final boolean validateFromHeader = true;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
|
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
|
||||||
|
|
@ -48,6 +67,8 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public int readHeader(LineReader reader) {
|
public int readHeader(LineReader reader) {
|
||||||
|
List<String> headerStrings = new ArrayList<String>();
|
||||||
|
|
||||||
String line = "";
|
String line = "";
|
||||||
try {
|
try {
|
||||||
while ((line = reader.readLine()) != null) {
|
while ((line = reader.readLine()) != null) {
|
||||||
|
|
@ -56,12 +77,25 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
}
|
}
|
||||||
else if (line.startsWith("#")) {
|
else if (line.startsWith("#")) {
|
||||||
headerStrings.add(line);
|
headerStrings.add(line);
|
||||||
String[] genotypes = line.split("\\s+");
|
header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0);
|
||||||
Set<String> genotypeSampleNames = new TreeSet<String>();
|
|
||||||
for (int x = 8; x < genotypes.length; x++)
|
// load the parsing fields
|
||||||
genotypeSampleNames.add(genotypes[x]);
|
Set<VCFHeaderLine> headerLines = header.getMetaData();
|
||||||
// this should be the next line -> header = VCFReaderUtils.createHeader(headerStrings);
|
|
||||||
header = new VCFHeader(new HashSet<VCFHeaderLine>(),genotypeSampleNames);
|
// setup our look-up lists for validation
|
||||||
|
for (VCFHeaderLine hl : headerLines) {
|
||||||
|
if (hl.getClass() == VCFFilterHeaderLine.class)
|
||||||
|
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
|
||||||
|
if (hl.getClass() == VCFFormatHeaderLine.class)
|
||||||
|
this.formatFields.add(((VCFFormatHeaderLine)hl).getmName());
|
||||||
|
if (hl.getClass() == VCFInfoHeaderLine.class)
|
||||||
|
this.infoFields.add(((VCFInfoHeaderLine)hl).getmName());
|
||||||
|
}
|
||||||
|
// sort the lists
|
||||||
|
Collections.sort(filterFields);
|
||||||
|
Collections.sort(formatFields);
|
||||||
|
Collections.sort(infoFields);
|
||||||
|
|
||||||
return headerStrings.size();
|
return headerStrings.size();
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
@ -73,9 +107,41 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
throw new RuntimeException("IO Exception ", e);
|
throw new RuntimeException("IO Exception ", e);
|
||||||
}
|
}
|
||||||
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int ZERO_CHAR = (byte)'0';
|
public Feature decodeLoc(String line) {
|
||||||
|
return decode(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* decode the line into a feature (VariantContext)
|
||||||
|
* @param line the line
|
||||||
|
* @return a VariantContext
|
||||||
|
*/
|
||||||
|
public Feature decode(String line) {
|
||||||
|
if (parts == null)
|
||||||
|
parts = new String[header.getColumnCount()];
|
||||||
|
|
||||||
|
int nParts = ParsingUtils.split(line, parts, '\t');
|
||||||
|
|
||||||
|
// our header cannot be null, we need the genotype sample names and counts
|
||||||
|
if (header == null) throw new IllegalStateException("VCF Header cannot be null");
|
||||||
|
|
||||||
|
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
|
||||||
|
if (nParts != header.getColumnCount())
|
||||||
|
throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);
|
||||||
|
|
||||||
|
|
||||||
|
return parseVCFLine(parts);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a an allele from an index and an array of alleles
|
||||||
|
* @param index the index
|
||||||
|
* @param alleles the alleles
|
||||||
|
* @return an Allele
|
||||||
|
*/
|
||||||
private static Allele oneAllele(char index, List<Allele> alleles) {
|
private static Allele oneAllele(char index, List<Allele> alleles) {
|
||||||
if ( index == '.' )
|
if ( index == '.' )
|
||||||
return Allele.NO_CALL;
|
return Allele.NO_CALL;
|
||||||
|
|
@ -85,8 +151,14 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* parse genotype alleles from the genotype string
|
||||||
|
* @param GT
|
||||||
|
* @param alleles
|
||||||
|
* @param cache
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||||
// this should cache results [since they are immutable] and return a single object for each genotype
|
// this should cache results [since they are immutable] and return a single object for each genotype
|
||||||
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
||||||
|
|
@ -99,6 +171,12 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
return GTAlleles;
|
return GTAlleles;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* parse out the info fields
|
||||||
|
* @param infoField the fields
|
||||||
|
* @param id the indentifier
|
||||||
|
* @return a mapping of keys to objects
|
||||||
|
*/
|
||||||
private Map<String, Object> parseInfo(String infoField, String id) {
|
private Map<String, Object> parseInfo(String infoField, String id) {
|
||||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||||
|
|
||||||
|
|
@ -120,73 +198,71 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
attributes.put(key, value);
|
attributes.put(key, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// validate the fields
|
||||||
|
validateFields(attributes.keySet(),infoFields);
|
||||||
|
|
||||||
attributes.put("ID", id);
|
attributes.put("ID", id);
|
||||||
return attributes;
|
return attributes;
|
||||||
}
|
}
|
||||||
|
|
||||||
//private static String[] CachedGTKey = new String[100];
|
private void validateFields(Set<String> attributes, List<String> fields) {
|
||||||
private static String[] CachedGTValues = new String[100];
|
// validate the info fields
|
||||||
|
if (validateFromHeader) {
|
||||||
public static boolean parseGenotypesToo = false; // for performance testing purposes
|
int count = 0;
|
||||||
public static boolean validate = true; // for performance testing purposes
|
for (String attr : attributes)
|
||||||
private static boolean REQUIRE_HEADER = false;
|
if (Collections.binarySearch(fields,attr) < 0)
|
||||||
|
throw new StingException("Unable to find field descibing attribute " + attr);
|
||||||
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
|
|
||||||
private String[] parts = null;
|
|
||||||
|
|
||||||
public Feature decodeLoc(String line) {
|
|
||||||
return decode(line);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Feature decode(String line) {
|
|
||||||
if ( parts == null )
|
|
||||||
parts = REQUIRE_HEADER ? new String[header.getColumnCount()] : new String[10000]; // todo -- remove require header
|
|
||||||
|
|
||||||
int nParts = ParsingUtils.split(line, parts, '\t');
|
|
||||||
|
|
||||||
if (REQUIRE_HEADER) { // todo -- remove require header
|
|
||||||
// our header cannot be null, we need the genotype sample names and counts
|
|
||||||
if ( header == null) throw new IllegalStateException("VCF Header cannot be null");
|
|
||||||
|
|
||||||
// check to make sure the split resulted in the correct number of fields (8 + (1 + genotytpe counts if it has genotypes)
|
|
||||||
if (nParts != header.getColumnCount()) throw new IllegalArgumentException("we expected " + header.getColumnCount() + " columns and we got " + nParts + " for line " + line);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return parseVCFLine(parts, nParts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* parse out the qual value
|
||||||
|
* @param qualString the quality string
|
||||||
|
* @return return a double
|
||||||
|
*/
|
||||||
private static Double parseQual(String qualString) {
|
private static Double parseQual(String qualString) {
|
||||||
// todo -- remove double once we deal with annoying VCFs from 1KG
|
// todo -- remove double once we deal with annoying VCFs from 1KG
|
||||||
return qualString.equals("-1") || qualString.equals("-1.0") ? VariantContext.NO_NEG_LOG_10PERROR : Double.valueOf(qualString) / 10;
|
return qualString.equals("-1") || qualString.equals("-1.0") ? VariantContext.NO_NEG_LOG_10PERROR : Double.valueOf(qualString) / 10;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* parse out the alleles
|
||||||
|
* @param ref the reference base
|
||||||
|
* @param alts a string of alternates to break into alleles
|
||||||
|
* @return a list of alleles, and a pair of the shortest and longest sequence
|
||||||
|
*/
|
||||||
private List<Allele> parseAlleles(String ref, String alts) {
|
private List<Allele> parseAlleles(String ref, String alts) {
|
||||||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||||
|
|
||||||
// ref
|
// ref
|
||||||
checkAllele(ref);
|
checkAllele(ref);
|
||||||
Allele refAllele = Allele.create(ref, true);
|
Allele refAllele = Allele.create(ref, true);
|
||||||
alleles.add(refAllele);
|
alleles.add(refAllele);
|
||||||
|
|
||||||
if ( alts.indexOf(",") == -1 ) { // only 1 alternatives, don't call string split
|
if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split
|
||||||
parse1Allele(alleles, alts);
|
parseSingleAllele(alleles, alts);
|
||||||
} else {
|
else
|
||||||
for ( String alt : Utils.split(alts, ",") ) {
|
for ( String alt : Utils.split(alts, ",") )
|
||||||
parse1Allele(alleles, alt);
|
parseSingleAllele(alleles, alt);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return alleles;
|
return alleles;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* check to make sure the allele is an acceptable allele
|
||||||
|
* @param allele the allele to check
|
||||||
|
*/
|
||||||
private static void checkAllele(String allele) {
|
private static void checkAllele(String allele) {
|
||||||
if ( ! Allele.acceptableAlleleBases(allele) ) {
|
if ( ! Allele.acceptableAlleleBases(allele) ) {
|
||||||
throw new StingException("Unparsable vcf record with allele " + allele);
|
throw new StingException("Unparsable vcf record with allele " + allele);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void parse1Allele(List<Allele> alleles, String alt) {
|
/**
|
||||||
|
* parse a single allele, given the allele list
|
||||||
|
* @param alleles the alleles available
|
||||||
|
* @param alt the allele to parse
|
||||||
|
*/
|
||||||
|
private void parseSingleAllele(List<Allele> alleles, String alt) {
|
||||||
checkAllele(alt);
|
checkAllele(alt);
|
||||||
|
|
||||||
Allele allele = Allele.create(alt, false);
|
Allele allele = Allele.create(alt, false);
|
||||||
|
|
@ -194,25 +270,44 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
alleles.add(allele);
|
alleles.add(allele);
|
||||||
}
|
}
|
||||||
|
|
||||||
// todo -- check a static map from filter String to HashSets to reuse objects and avoid parsing
|
/**
|
||||||
|
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
|
||||||
|
* @param filterString the string to parse
|
||||||
|
* @return a set of the filters applied
|
||||||
|
*/
|
||||||
private Set<String> parseFilters(String filterString) {
|
private Set<String> parseFilters(String filterString) {
|
||||||
if ( filterString.equals(".") )
|
Set<String> fFields;
|
||||||
|
|
||||||
|
// a PASS is simple (no filters)
|
||||||
|
if ( filterString.equals("PASS") ) {
|
||||||
return null;
|
return null;
|
||||||
|
}
|
||||||
|
// else do we have the filter string cached?
|
||||||
|
else if (filterHash.containsKey(filterString)) {
|
||||||
|
fFields = filterHash.get(filterString);
|
||||||
|
}
|
||||||
|
// otherwise we have to parse and cache the value
|
||||||
else {
|
else {
|
||||||
HashSet<String> s = new HashSet<String>(1);
|
LinkedHashSet<String> s = new LinkedHashSet<String>(1);
|
||||||
if ( filterString.indexOf(";") == -1 ) {
|
if ( filterString.indexOf(";") == -1 ) {
|
||||||
s.add(filterString);
|
s.add(filterString);
|
||||||
} else {
|
} else {
|
||||||
s.addAll(Utils.split(filterString, ";"));
|
s.addAll(Utils.split(filterString, ";"));
|
||||||
}
|
}
|
||||||
|
filterHash.put(filterString,s);
|
||||||
return s;
|
fFields = s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validateFields(fFields,filterFields);
|
||||||
|
return fFields;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String[] GTKeys = new String[100];
|
/**
|
||||||
|
* parse out the VCF line
|
||||||
private VariantContext parseVCFLine(String[] parts, int nParts) {
|
* @param parts the parts split up
|
||||||
|
* @return a variant context object
|
||||||
|
*/
|
||||||
|
private VariantContext parseVCFLine(String[] parts) {
|
||||||
String contig = parts[0];
|
String contig = parts[0];
|
||||||
long pos = Long.valueOf(parts[1]);
|
long pos = Long.valueOf(parts[1]);
|
||||||
String id = parts[2];
|
String id = parts[2];
|
||||||
|
|
@ -224,20 +319,27 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
String GT = parts[8];
|
String GT = parts[8];
|
||||||
int genotypesStart = 9;
|
int genotypesStart = 9;
|
||||||
|
|
||||||
List<Allele> alleles = parseAlleles(ref, alts);
|
List<Allele> alleles = parseAlleles(ref, alts);
|
||||||
Set<String> filters = parseFilters(filter);
|
Set<String> filters = parseFilters(filter);
|
||||||
Map<String, Object> attributes = parseInfo(info, id);
|
Map<String, Object> attributes = parseInfo(info, id);
|
||||||
|
|
||||||
// parse genotypes
|
// parse genotypes
|
||||||
int nGTKeys = ParsingUtils.split(GT, GTKeys, ':');
|
int nGTKeys = ParsingUtils.split(GT, GTKeys, ':');
|
||||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(Math.max(nParts - genotypesStart, 1));
|
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
|
||||||
|
Iterator<String> iter = header.getGenotypeSamples().iterator();
|
||||||
|
|
||||||
|
Pair<GenomeLoc,List<Allele>> locAndAlleles = (ref.length() > 1) ?
|
||||||
|
clipAlleles(contig,pos,ref,alleles) :
|
||||||
|
new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,pos),alleles);
|
||||||
|
|
||||||
|
|
||||||
if ( parseGenotypesToo ) {
|
if ( parseGenotypesToo ) {
|
||||||
alleleMap.clear();
|
alleleMap.clear();
|
||||||
for ( int genotypeOffset = genotypesStart; genotypeOffset < nParts; genotypeOffset++ ) {
|
for ( int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++ ) {
|
||||||
String sample = parts[genotypeOffset];
|
String sample = parts[genotypeOffset];
|
||||||
String[] GTValues = CachedGTValues;
|
String[] GTValues = CachedGTValues;
|
||||||
ParsingUtils.split(sample, GTValues, ':');
|
ParsingUtils.split(sample, GTValues, ':');
|
||||||
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], alleles, alleleMap);
|
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValues[0], locAndAlleles.second, alleleMap);
|
||||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||||
Set<String> genotypeFilters = null;
|
Set<String> genotypeFilters = null;
|
||||||
|
|
||||||
|
|
@ -249,32 +351,69 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
if ( GTKeys[i].equals("GQ") ) {
|
if ( GTKeys[i].equals("GQ") ) {
|
||||||
GTQual = parseQual(GTValues[i]);
|
GTQual = parseQual(GTValues[i]);
|
||||||
} if ( GTKeys[i].equals("FL") ) { // deal with genotype filters here
|
} if ( GTKeys[i].equals("FL") ) { // deal with genotype filters here
|
||||||
// todo -- get genotype filters working
|
genotypeFilters.addAll(parseFilters(GTValues[i]));
|
||||||
// genotypeFilters = new HashSet<String>();
|
|
||||||
// if ( vcfG.isFiltered() ) // setup the FL genotype filter fields
|
|
||||||
// genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";")));
|
|
||||||
} else {
|
} else {
|
||||||
gtAttributes.put(GTKeys[i], GTValues[i]);
|
gtAttributes.put(GTKeys[i], GTValues[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// validate the format fields
|
||||||
|
validateFields(gtAttributes.keySet(),formatFields);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean phased = GTKeys[0].charAt(1) == '|';
|
boolean phased = GTKeys[0].charAt(1) == '|';
|
||||||
|
|
||||||
// todo -- actually parse the header to get the sample name
|
Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
||||||
Genotype g = new Genotype("X" + genotypeOffset, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
|
||||||
genotypes.put(g.getSampleName(), g);
|
genotypes.put(g.getSampleName(), g);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// todo -- doesn't work for indels [the whole reason for VCF4]
|
|
||||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig,pos,pos + ref.length() - 1);
|
|
||||||
|
|
||||||
// todo -- we need access to our track name to name the variant context
|
// todo -- we need access to our track name to name the variant context
|
||||||
VariantContext vc = new VariantContext("foo", loc, alleles, genotypes, qual, filters, attributes);
|
return new VariantContext("foo", locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||||
return vc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* clip the alleles, based on the reference
|
||||||
|
* @param unclippedAlleles the list of alleles
|
||||||
|
* @return a list of alleles, clipped to the reference
|
||||||
|
*/
|
||||||
|
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
||||||
|
List<Allele> newAlleleList = new ArrayList<Allele>();
|
||||||
|
|
||||||
|
// find the preceeding string common to all alleles and the reference
|
||||||
|
int forwardClipped = 0;
|
||||||
|
boolean clipping = true;
|
||||||
|
while (clipping) {
|
||||||
|
for (Allele a : unclippedAlleles)
|
||||||
|
if (forwardClipped > ref.length() - 1)
|
||||||
|
clipping = false;
|
||||||
|
else if (a.length() <= forwardClipped || (a.getBases()[forwardClipped] != ref.getBytes()[forwardClipped])) {
|
||||||
|
clipping = false;
|
||||||
|
}
|
||||||
|
if (clipping) forwardClipped++;
|
||||||
|
}
|
||||||
|
|
||||||
|
int reverseClipped = 0;
|
||||||
|
clipping = true;
|
||||||
|
while (clipping) {
|
||||||
|
for (Allele a : unclippedAlleles)
|
||||||
|
if (a.length() - reverseClipped < 0 || a.length() - forwardClipped == 0)
|
||||||
|
clipping = false;
|
||||||
|
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
|
||||||
|
clipping = false;
|
||||||
|
if (clipping) reverseClipped++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// check to see if we're about to clip all the bases from the reference, if so back off the front clip a base
|
||||||
|
if (forwardClipped + reverseClipped >= ref.length())
|
||||||
|
forwardClipped--;
|
||||||
|
|
||||||
|
for (Allele a : unclippedAlleles)
|
||||||
|
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference()));
|
||||||
|
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipped,(position+ref.length()-reverseClipped-1)),
|
||||||
|
newAlleleList);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @return the type of record
|
* @return the type of record
|
||||||
|
|
@ -284,8 +423,15 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
return VariantContext.class;
|
return VariantContext.class;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the header
|
||||||
|
* @param clazz the class were expecting
|
||||||
|
* @return our VCFHeader
|
||||||
|
* @throws ClassCastException
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Object getHeader(Class clazz) throws ClassCastException {
|
public VCFHeader getHeader(Class clazz) throws ClassCastException {
|
||||||
return null; // TODO: fix this Aaron
|
if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class);
|
||||||
|
return this.header;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -91,7 +91,7 @@ public class VCF4ReaderTestWalker extends RodWalker<VCFRecord,Long> {
|
||||||
|
|
||||||
String[] parts = new String[10000];
|
String[] parts = new String[10000];
|
||||||
public void onTraversalDone(Long num){
|
public void onTraversalDone(Long num){
|
||||||
VCF4Codec vcf4codec = new VCF4Codec(true);
|
VCF4Codec vcf4codec = new VCF4Codec();
|
||||||
VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
|
VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
|
||||||
VCF4Codec.validate = ! DontValidate;
|
VCF4Codec.validate = ! DontValidate;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,12 +2,13 @@ package org.broadinstitute.sting;
|
||||||
|
|
||||||
import org.apache.log4j.*;
|
import org.apache.log4j.*;
|
||||||
import org.apache.log4j.spi.LoggingEvent;
|
import org.apache.log4j.spi.LoggingEvent;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.junit.*;
|
import org.junit.*;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.*;
|
||||||
import java.io.File;
|
import java.math.BigInteger;
|
||||||
import java.io.FileReader;
|
import java.security.MessageDigest;
|
||||||
import java.io.IOException;
|
import java.security.NoSuchAlgorithmException;
|
||||||
import java.util.Enumeration;
|
import java.util.Enumeration;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -175,4 +176,51 @@ public abstract class BaseTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* a little utility function for all tests to md5sum a file
|
||||||
|
* Shameless taken from:
|
||||||
|
*
|
||||||
|
* http://www.javalobby.org/java/forums/t84420.html
|
||||||
|
*
|
||||||
|
* @param file the file
|
||||||
|
* @return a string
|
||||||
|
*/
|
||||||
|
public static String md5SumFile(File file) {
|
||||||
|
MessageDigest digest;
|
||||||
|
try {
|
||||||
|
digest = MessageDigest.getInstance("MD5");
|
||||||
|
} catch (NoSuchAlgorithmException e) {
|
||||||
|
throw new StingException("Unable to find MD5 digest");
|
||||||
|
}
|
||||||
|
InputStream is = null;
|
||||||
|
try {
|
||||||
|
is = new FileInputStream(file);
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
throw new StingException("Unable to open file " + file);
|
||||||
|
}
|
||||||
|
byte[] buffer = new byte[8192];
|
||||||
|
int read = 0;
|
||||||
|
try {
|
||||||
|
while ((read = is.read(buffer)) > 0) {
|
||||||
|
digest.update(buffer, 0, read);
|
||||||
|
}
|
||||||
|
byte[] md5sum = digest.digest();
|
||||||
|
BigInteger bigInt = new BigInteger(1, md5sum);
|
||||||
|
return bigInt.toString(16);
|
||||||
|
|
||||||
|
}
|
||||||
|
catch (IOException e) {
|
||||||
|
throw new StingException("Unable to process file for MD5", e);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
try {
|
||||||
|
is.close();
|
||||||
|
}
|
||||||
|
catch (IOException e) {
|
||||||
|
throw new StingException("Unable to close input stream for MD5 calculation", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,329 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata.features.vcf4;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.AsciiLineReader;
|
||||||
|
import org.broad.tribble.vcf.*;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test out pieces of the VCF 4 codec.
|
||||||
|
*/
|
||||||
|
public class VCF4UnitTest extends BaseTest {
|
||||||
|
File vcfFile = new File("testdata/vcfExmaple.vcf4");
|
||||||
|
|
||||||
|
// setup the contig ordering
|
||||||
|
@BeforeClass
|
||||||
|
public static void setupContig() {
|
||||||
|
IndexedFastaSequenceFile seq;
|
||||||
|
try {
|
||||||
|
seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta"));
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
throw new StingException("unable to load the sequence dictionary");
|
||||||
|
}
|
||||||
|
GenomeLocParser.setupRefContigOrdering(seq);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReadBasicHeader() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||||
|
|
||||||
|
int seenRecords = 0;
|
||||||
|
|
||||||
|
// check some field entries of each type
|
||||||
|
Set<VCFHeaderLine> lines = testSetup.getHeader().getMetaData();
|
||||||
|
|
||||||
|
for (VCFHeaderLine line : lines) {
|
||||||
|
// check the vcf info header lines
|
||||||
|
if (line instanceof VCFInfoHeaderLine) {
|
||||||
|
VCFInfoHeaderLine ihLIne = (VCFInfoHeaderLine)line;
|
||||||
|
|
||||||
|
// test a normal info line
|
||||||
|
if (ihLIne.getmName().equals("NS")) {
|
||||||
|
Assert.assertEquals(VCFInfoHeaderLine.INFO_TYPE.Integer,ihLIne.getmType());
|
||||||
|
Assert.assertEquals(1,ihLIne.getmCount());
|
||||||
|
Assert.assertTrue("Number of Samples With Data".equals(ihLIne.getmDescription()));
|
||||||
|
seenRecords++;
|
||||||
|
}
|
||||||
|
// test a info line that uses the period to represent an unbounded value
|
||||||
|
if (ihLIne.getmName().equals("AF")) {
|
||||||
|
Assert.assertEquals(VCFInfoHeaderLine.INFO_TYPE.Float,ihLIne.getmType());
|
||||||
|
Assert.assertEquals(VCFInfoHeaderLine.UNBOUNDED,ihLIne.getmCount());
|
||||||
|
Assert.assertTrue("Allele Frequency".equals(ihLIne.getmDescription()));
|
||||||
|
seenRecords++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// check the vcf filter header lines
|
||||||
|
if (line instanceof VCFFilterHeaderLine) {
|
||||||
|
VCFFilterHeaderLine fhLIne = (VCFFilterHeaderLine)line;
|
||||||
|
if (fhLIne.getmName().equals("q10")) {
|
||||||
|
Assert.assertTrue("Quality below 10".equals(fhLIne.getmDescription()));
|
||||||
|
seenRecords++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check the vcf info header lines
|
||||||
|
if (line instanceof VCFFormatHeaderLine) {
|
||||||
|
VCFFormatHeaderLine ifLIne = (VCFFormatHeaderLine)line;
|
||||||
|
if (ifLIne.getmName().equals("GT")) {
|
||||||
|
Assert.assertEquals(VCFFormatHeaderLine.FORMAT_TYPE.String,ifLIne.getmType());
|
||||||
|
Assert.assertEquals(1,ifLIne.getmCount());
|
||||||
|
Assert.assertTrue("Genotype".equals(ifLIne.getmDescription()));
|
||||||
|
seenRecords++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertEquals("We expected to see three records (one of each type we check), but didn't.",4,seenRecords);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOutputHeader() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||||
|
|
||||||
|
File tempFile = null;
|
||||||
|
try {
|
||||||
|
tempFile = File.createTempFile("VCF4Test","vcf");
|
||||||
|
} catch (IOException e) {
|
||||||
|
Assert.fail("Couldn't create a temporary file ");
|
||||||
|
}
|
||||||
|
|
||||||
|
// write it to disk
|
||||||
|
VCFWriter writer = new VCFWriter(tempFile);
|
||||||
|
writer.writeHeader(testSetup.getHeader());
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
// md5 sum the file
|
||||||
|
Assert.assertTrue("expecting md5sum of 637f3c92806d993a9c7b7116da398d6, but got " + md5SumFile(tempFile),"637f3c92806d993a9c7b7116da398d6".equals(md5SumFile(tempFile)));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCountVCF4Records() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||||
|
AsciiLineReader reader = testSetup.getReader();
|
||||||
|
VCF4Codec codec = testSetup.getCodec();
|
||||||
|
|
||||||
|
// now parse the lines
|
||||||
|
String line = null;
|
||||||
|
try {
|
||||||
|
line = reader.readLine();
|
||||||
|
} catch (IOException e) {
|
||||||
|
Assert.fail("Failed to read a line");
|
||||||
|
}
|
||||||
|
|
||||||
|
// our record count
|
||||||
|
int recordCount = 0;
|
||||||
|
while (line != null) {
|
||||||
|
try {
|
||||||
|
//System.err.println(codec.decode(line).toString());
|
||||||
|
recordCount++;
|
||||||
|
testSetup.codec.decode(line);
|
||||||
|
line = reader.readLine();
|
||||||
|
} catch (IOException e) {
|
||||||
|
Assert.fail("Failed to read a line");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Assert.assertEquals(6,recordCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// two constants for testing
|
||||||
|
String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
|
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
|
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HG=12\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCheckInfoValidation() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||||
|
testSetup.codec.decode(regularLine);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCheckTwoFewInfoValidation() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||||
|
testSetup.codec.decode(twoFewInfoLine);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected=StingException.class)
|
||||||
|
public void testCheckTwoManyInfoValidation() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfFile);
|
||||||
|
testSetup.codec.decode(twoManyInfoLine);
|
||||||
|
}
|
||||||
|
|
||||||
|
//File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
||||||
|
|
||||||
|
/*@Test
|
||||||
|
public void checkLargeVCF() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
||||||
|
AsciiLineReader reader = testSetup.getReader();
|
||||||
|
VCF4Codec codec = testSetup.getCodec();
|
||||||
|
|
||||||
|
// now parse the lines
|
||||||
|
String line = null;
|
||||||
|
try {
|
||||||
|
line = reader.readLine();
|
||||||
|
} catch (IOException e) {
|
||||||
|
Assert.fail("Failed to read a line");
|
||||||
|
}
|
||||||
|
|
||||||
|
// our record count
|
||||||
|
int recordCount = 0;
|
||||||
|
int badRecordCount = 0;
|
||||||
|
while (line != null) {
|
||||||
|
try {
|
||||||
|
recordCount++;
|
||||||
|
try {
|
||||||
|
testSetup.codec.decode(line);
|
||||||
|
} catch (Exception e) {
|
||||||
|
System.err.println(e.getMessage() + " -> " + line);
|
||||||
|
System.err.println(line);
|
||||||
|
badRecordCount++;
|
||||||
|
}
|
||||||
|
line = reader.readLine();
|
||||||
|
if (recordCount % 1000 == 0)
|
||||||
|
System.err.println("record count == " + recordCount);
|
||||||
|
} catch (IOException e) {
|
||||||
|
Assert.fail("Failed to read a line");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Assert.assertEquals(0,badRecordCount);
|
||||||
|
Assert.assertEquals(728075,recordCount);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testClippingOfAllelesDeletionAndInsertion() {
|
||||||
|
String ref = "GGTT";
|
||||||
|
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||||
|
alleles.add(Allele.create(ref,true));
|
||||||
|
alleles.add(Allele.create("GGAATT",false));
|
||||||
|
alleles.add(Allele.create("GT",false));
|
||||||
|
|
||||||
|
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||||
|
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,3)));
|
||||||
|
|
||||||
|
// we know the ordering
|
||||||
|
//System.err.println(locAndList.second.get(0).toString());
|
||||||
|
//System.err.println(locAndList.second.get(1).toString());
|
||||||
|
//System.err.println(locAndList.second.get(2).toString());
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).toString().equals("GT*"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||||
|
Assert.assertTrue(locAndList.second.get(1).toString().equals("GAAT"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(2).toString().equals("-"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testClippingOfAllelesLongRefRepeat() {
|
||||||
|
String ref = "GGGG";
|
||||||
|
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||||
|
alleles.add(Allele.create(ref,true));
|
||||||
|
alleles.add(Allele.create("G",false));
|
||||||
|
alleles.add(Allele.create("GG",false));
|
||||||
|
|
||||||
|
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||||
|
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,4)));
|
||||||
|
|
||||||
|
// we know the ordering
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||||
|
Assert.assertTrue(locAndList.second.get(1).toString().equals("-"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testClippingOfAllelesPlainPolyMorph() {
|
||||||
|
String ref = "C";
|
||||||
|
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||||
|
alleles.add(Allele.create(ref,true));
|
||||||
|
alleles.add(Allele.create("T",false));
|
||||||
|
alleles.add(Allele.create("G",false));
|
||||||
|
|
||||||
|
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||||
|
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",1,1)));
|
||||||
|
|
||||||
|
// we know the ordering
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).toString().equals("C*"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||||
|
Assert.assertTrue(locAndList.second.get(1).toString().equals("T"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testClippingOfAllelesInsertions() {
|
||||||
|
String ref = "C";
|
||||||
|
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||||
|
alleles.add(Allele.create(ref,true));
|
||||||
|
alleles.add(Allele.create("CTTTTT",false));
|
||||||
|
alleles.add(Allele.create("GGGGGG",false));
|
||||||
|
|
||||||
|
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||||
|
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",1,1)));
|
||||||
|
|
||||||
|
// we know the ordering
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).toString().equals("C*"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||||
|
Assert.assertTrue(locAndList.second.get(1).toString().equals("CTTTTT"));
|
||||||
|
Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* a test setup for the VCF 4 codec
|
||||||
|
*/
|
||||||
|
private class TestSetup {
|
||||||
|
private AsciiLineReader reader;
|
||||||
|
private VCF4Codec codec;
|
||||||
|
private VCFHeader header;
|
||||||
|
|
||||||
|
public AsciiLineReader getReader() {
|
||||||
|
return reader;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCF4Codec getCodec() {
|
||||||
|
return codec;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFHeader getHeader() {
|
||||||
|
return header;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TestSetup invoke(File vcfFile) {
|
||||||
|
reader = null;
|
||||||
|
try {
|
||||||
|
reader = new AsciiLineReader(new FileInputStream(vcfFile));
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
Assert.fail("Unable to parse out VCF file " + vcfFile);
|
||||||
|
}
|
||||||
|
codec = new VCF4Codec();
|
||||||
|
codec.readHeader(reader);
|
||||||
|
header = codec.getHeader(VCFHeader.class);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue