the start to the VCF implementation
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1425 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d4808433a1
commit
8403618846
|
|
@ -0,0 +1,123 @@
|
||||||
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.LinkedHashSet;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class VCFHeader
|
||||||
|
* <p/>
|
||||||
|
* A descriptions should go here. Blame aaron if it's missing.
|
||||||
|
*/
|
||||||
|
public class VCFHeader {
|
||||||
|
|
||||||
|
// the manditory header fields
|
||||||
|
public enum HEADER_FIELDS {
|
||||||
|
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
|
||||||
|
}
|
||||||
|
|
||||||
|
// our header field ordering, as a linked hash set to guarantee ordering
|
||||||
|
private Set<HEADER_FIELDS> mHeaderFields = new LinkedHashSet<HEADER_FIELDS>();
|
||||||
|
|
||||||
|
// the associated meta data
|
||||||
|
private final List<Pair<String, String>> mMetaData = new ArrayList<Pair<String, String>>();
|
||||||
|
|
||||||
|
// the list of auxillary tags
|
||||||
|
private final List<String> auxillaryTags = new ArrayList<String>();
|
||||||
|
|
||||||
|
// the character string that indicates meta data
|
||||||
|
public static final String METADATA_INDICATOR = "##";
|
||||||
|
|
||||||
|
// the header string indicator
|
||||||
|
public static final String HEADER_INDICATOR = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* our log, which we want to capture anything from this class
|
||||||
|
*/
|
||||||
|
private static Logger logger = Logger.getLogger(VCFHeader.class);
|
||||||
|
|
||||||
|
// patterns we use for detecting meta data and header lines
|
||||||
|
private static Pattern pMeta = Pattern.compile("^" + METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$");
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF header, given an array of strings that all start with at least the # character
|
||||||
|
*
|
||||||
|
* @param headerStrings a list of header strings
|
||||||
|
*/
|
||||||
|
public VCFHeader(List<String> headerStrings) {
|
||||||
|
try {
|
||||||
|
Thread.sleep(5000);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
// iterate over all the passed in strings
|
||||||
|
for (String str : headerStrings) {
|
||||||
|
Matcher matcher = pMeta.matcher(str);
|
||||||
|
if (matcher.matches()) {
|
||||||
|
String metaKey = "";
|
||||||
|
String metaValue = "";
|
||||||
|
if (matcher.groupCount() < 1) continue;
|
||||||
|
if (matcher.groupCount() == 2) metaValue = matcher.group(2);
|
||||||
|
metaKey = matcher.group(1);
|
||||||
|
mMetaData.add(new Pair<String, String>(metaKey, metaValue));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// iterate over all the passed in strings
|
||||||
|
for (String str : headerStrings) {
|
||||||
|
if (str.startsWith("#") && !str.startsWith("##")) {
|
||||||
|
String[] strings = str.substring(1).split("\\s+");
|
||||||
|
for (String s : strings) {
|
||||||
|
if (mHeaderFields.contains(s)) throw new StingException("Header field duplication is not allowed");
|
||||||
|
try {
|
||||||
|
mHeaderFields.add(HEADER_FIELDS.valueOf(s));
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
this.auxillaryTags.add(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (mHeaderFields.size() != HEADER_FIELDS.values().length) {
|
||||||
|
throw new StingException("The VCF header is missing " + (HEADER_FIELDS.values().length - mHeaderFields.size()) + " required fields");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the header fieldsm in order they're presented in the input file
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Set<HEADER_FIELDS> getHeaderFields() {
|
||||||
|
return mHeaderFields;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the meta data, associated with this header
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public List<Pair<String, String>> getMetaData() {
|
||||||
|
return mMetaData;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the auxillary tags
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public List<String> getAuxillaryTags() {
|
||||||
|
return auxillaryTags;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,88 @@
|
||||||
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The VCFReader class, which given a valid vcf file, parses out the header and VCF records
|
||||||
|
*/
|
||||||
|
public class VCFReader implements Iterator<VCFRecord> {
|
||||||
|
|
||||||
|
// our VCF header
|
||||||
|
private VCFHeader mHeader;
|
||||||
|
|
||||||
|
// our buffered input stream
|
||||||
|
private BufferedReader mReader;
|
||||||
|
|
||||||
|
// our next record
|
||||||
|
private VCFRecord mNextRecord = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a VCF reader, given a VCF file
|
||||||
|
*
|
||||||
|
* @param vcfFile
|
||||||
|
*/
|
||||||
|
public VCFReader(File vcfFile) {
|
||||||
|
Charset utf8 = Charset.forName("UTF-8");
|
||||||
|
try {
|
||||||
|
mReader = new BufferedReader(
|
||||||
|
new InputStreamReader(
|
||||||
|
new FileInputStream(vcfFile),
|
||||||
|
utf8));
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
throw new StingException("Unable to find VCF file: " + vcfFile, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
String line = null;
|
||||||
|
try {
|
||||||
|
ArrayList<String> lines = new ArrayList<String>();
|
||||||
|
line = mReader.readLine();
|
||||||
|
while (line.startsWith("#")) {
|
||||||
|
lines.add(line);
|
||||||
|
line = mReader.readLine();
|
||||||
|
}
|
||||||
|
mHeader = new VCFHeader(lines);
|
||||||
|
mNextRecord = new VCFRecord(mHeader, line);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new StingException("Failed to parse VCF File on line: " + line, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return true if we have another VCF record to return
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
return (mNextRecord != null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return the next available VCF record. Make sure to check availability with a call to hasNext!
|
||||||
|
* @return a VCFRecord, representing the next record in the file
|
||||||
|
*/
|
||||||
|
public VCFRecord next() {
|
||||||
|
VCFRecord rec = mNextRecord;
|
||||||
|
try {
|
||||||
|
String line = mReader.readLine();
|
||||||
|
if (line == null) mNextRecord = null;
|
||||||
|
else mNextRecord = new VCFRecord(mHeader, line);
|
||||||
|
} catch (IOException e) {
|
||||||
|
mNextRecord = null;
|
||||||
|
}
|
||||||
|
return rec;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove is not supported
|
||||||
|
*/
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Unsupported operation");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,153 @@
|
||||||
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the basic VCF record type
|
||||||
|
*/
|
||||||
|
public class VCFRecord {
|
||||||
|
// required field values
|
||||||
|
private Map<VCFHeader.HEADER_FIELDS, String> mValues = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||||
|
|
||||||
|
// our auxillary values
|
||||||
|
private Map<String, String> mAuxValues = new HashMap<String, String>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCFRecord, given a VCF header and the the values in this field
|
||||||
|
*
|
||||||
|
* @param header the VCF header
|
||||||
|
* @param line the line to parse into individual fields
|
||||||
|
*/
|
||||||
|
public VCFRecord(VCFHeader header, String line) {
|
||||||
|
String tokens[] = line.split("\\s+");
|
||||||
|
if (tokens.length != (header.getAuxillaryTags().size() + header.getHeaderFields().size())) {
|
||||||
|
throw new StingException("Line:" + line + " didn't parse into " + (header.getAuxillaryTags().size() + header.getHeaderFields().size()) + " fields");
|
||||||
|
}
|
||||||
|
|
||||||
|
int tokenCount = 0;
|
||||||
|
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
|
||||||
|
mValues.put(field, tokens[tokenCount]);
|
||||||
|
tokenCount++;
|
||||||
|
}
|
||||||
|
for (String aux : header.getAuxillaryTags()) {
|
||||||
|
mAuxValues.put(aux, tokens[tokenCount]);
|
||||||
|
tokenCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* lookup a value, given it's column name
|
||||||
|
*
|
||||||
|
* @param key the column name, which is looked up in both the set columns and the auxillary columns
|
||||||
|
* @return a String representing the column values, or null if the field doesn't exist in this record
|
||||||
|
*/
|
||||||
|
public String getValue(String key) {
|
||||||
|
try {
|
||||||
|
return mValues.get(VCFHeader.HEADER_FIELDS.valueOf(key));
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
if (this.mAuxValues.containsKey(key)) {
|
||||||
|
return mAuxValues.get(key);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get a required field, given the field tag
|
||||||
|
*
|
||||||
|
* @param field
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public String getValue(VCFHeader.HEADER_FIELDS field) {
|
||||||
|
return mValues.get(field);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the string for the chromosome that this VCF record is associated with
|
||||||
|
*/
|
||||||
|
public String getChromosome() {
|
||||||
|
return this.mValues.get(VCFHeader.HEADER_FIELDS.CHROM);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return this VCF records position on the specified chromosome
|
||||||
|
*/
|
||||||
|
public long getPosition() {
|
||||||
|
return Long.valueOf(this.mValues.get(VCFHeader.HEADER_FIELDS.POS));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the ID value for this record
|
||||||
|
*/
|
||||||
|
public String getID() {
|
||||||
|
return this.mValues.get(VCFHeader.HEADER_FIELDS.ID);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the reference base
|
||||||
|
*
|
||||||
|
* @return either A, T, C, G, or N
|
||||||
|
*/
|
||||||
|
public char getReferenceBase() {
|
||||||
|
// TODO: this field isn't validated correctly
|
||||||
|
return this.mValues.get(VCFHeader.HEADER_FIELDS.REF).charAt(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the alternate allele strings
|
||||||
|
*
|
||||||
|
* @return an array of strings representing the alt alleles, or null if there are none
|
||||||
|
*/
|
||||||
|
public String[] getAlternateAlleles() {
|
||||||
|
if (this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).trim().equals(".")) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasAlternateAllele() {
|
||||||
|
return getAlternateAlleles() != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the phred-scaled quality score
|
||||||
|
*/
|
||||||
|
public int getQual() {
|
||||||
|
return Integer.valueOf(this.mValues.get(VCFHeader.HEADER_FIELDS.QUAL));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the filter criteria
|
||||||
|
*
|
||||||
|
* @return an array of strings representing the filtering criteria, or null if none were applied
|
||||||
|
*/
|
||||||
|
public String[] getFilteringCodes() {
|
||||||
|
if (this.mValues.get(VCFHeader.HEADER_FIELDS.FILTER).trim().equals("0")) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(";");
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasFilteringCodes() {
|
||||||
|
return getAlternateAlleles() != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the information key-value pairs as a Map<>
|
||||||
|
* @return a map, of the info key-value pairs
|
||||||
|
*/
|
||||||
|
public Map<String,String> getInfoValues() {
|
||||||
|
Map<String,String> ret = new HashMap<String,String>();
|
||||||
|
String infoSplit[] = mValues.get(VCFHeader.HEADER_FIELDS.INFO).split(";");
|
||||||
|
for (String s: infoSplit) {
|
||||||
|
String keyValue[] = s.split("=");
|
||||||
|
if (keyValue.length != 2) throw new StingException("Key value pairs must have both a key and a value; pair: " + s);
|
||||||
|
ret.put(keyValue[0],keyValue[1]);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.Assert;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author aaron
|
||||||
|
*
|
||||||
|
* Class VCFHeaderTest
|
||||||
|
*
|
||||||
|
* Test the VCF Header class
|
||||||
|
*/
|
||||||
|
public class VCFHeaderTest extends BaseTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test1() {
|
||||||
|
File in = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
|
||||||
|
if (!in.exists()) throw new StingException("vfc doesn't exist");
|
||||||
|
List<String> array = new ArrayList<String>();
|
||||||
|
try {
|
||||||
|
BufferedReader reader = new BufferedReader(new FileReader("vcfexample.vcf"));
|
||||||
|
String line = reader.readLine();
|
||||||
|
while (line.startsWith("#")) {
|
||||||
|
array.add(line);
|
||||||
|
line = reader.readLine();
|
||||||
|
}
|
||||||
|
VCFHeader header = new VCFHeader(array);
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
Assert.fail("File not found exception in VCFHeaderTest");
|
||||||
|
} catch (IOException e) {
|
||||||
|
Assert.fail("IO exception in VCFHeaderTest");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test the VCFReader class test
|
||||||
|
*/
|
||||||
|
public class VCFReaderTest extends BaseTest {
|
||||||
|
|
||||||
|
private static File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testVCFInput() {
|
||||||
|
VCFReader reader = new VCFReader(vcfFile);
|
||||||
|
int counter = 0;
|
||||||
|
while (reader.hasNext()) {
|
||||||
|
counter++;
|
||||||
|
reader.next();
|
||||||
|
System.err.println(counter);
|
||||||
|
}
|
||||||
|
Assert.assertEquals(5,counter);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue