diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java
new file mode 100644
index 000000000..351076d05
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java
@@ -0,0 +1,139 @@
+package org.broadinstitute.sting.utils.genotype.vcf;
+
+import org.broadinstitute.sting.utils.Pair;
+import org.broadinstitute.sting.utils.StingException;
+import org.apache.log4j.Logger;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.LinkedHashSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+/**
+ * @author aaron
+ *
+ * Class VCFHeader
+ *
+ * The parsed header of a VCF file. Lines that start with "##" are stored as key/value meta data;
+ * a line that starts with a single "#" names the columns. The eight required columns are kept
+ * (in file order) as HEADER_FIELDS, and every other column name is kept as an auxillary tag.
+ */
+public class VCFHeader {
+
+    // the manditory header fields
+    public enum HEADER_FIELDS {
+        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
+    }
+
+    // our header field ordering, as a linked hash set to guarantee ordering
+    private final Set<HEADER_FIELDS> mHeaderFields = new LinkedHashSet<HEADER_FIELDS>();
+
+    // the associated meta data, as (key, value) pairs in the order they appear
+    private final List<Pair<String, String>> mMetaData = new ArrayList<Pair<String, String>>();
+
+    // the list of auxillary tags
+    private final List<String> auxillaryTags = new ArrayList<String>();
+
+    // the character string that indicates meta data
+    public static final String METADATA_INDICATOR = "##";
+
+    // the header string indicator
+    public static final String HEADER_INDICATOR = "#";
+
+    /**
+     * our log, which we want to capture anything from this class
+     */
+    private static final Logger logger = Logger.getLogger(VCFHeader.class);
+
+    // pattern for meta data lines: the key runs up to the first '=', and the value is the rest of
+    // the line, so a value may itself contain '=' or embedded whitespace
+    private static final Pattern pMeta = Pattern.compile("^" + METADATA_INDICATOR + "\\s*([^=\\s]+)\\s*=\\s*(.*?)\\s*$");
+
+    /**
+     * create a VCF header, given a list of strings that all start with at least the # character
+     *
+     * @param headerStrings a list of header strings
+     * @throws StingException if a column name is repeated or a required column is missing
+     */
+    public VCFHeader(List<String> headerStrings) {
+        for (String str : headerStrings) {
+            if (str.startsWith(METADATA_INDICATOR)) {
+                parseMetaData(str);
+            } else if (str.startsWith(HEADER_INDICATOR)) {
+                parseColumnNames(str);
+            }
+        }
+
+        // every required column has to be named somewhere in the header
+        List<HEADER_FIELDS> missing = new ArrayList<HEADER_FIELDS>();
+        for (HEADER_FIELDS field : HEADER_FIELDS.values()) {
+            if (!mHeaderFields.contains(field)) missing.add(field);
+        }
+        if (!missing.isEmpty()) {
+            throw new StingException("The VCF header is missing " + missing.size() + " required fields: " + missing);
+        }
+    }
+
+    /**
+     * store the key and value of a meta data line; lines that aren't of the form ##key=value are skipped
+     *
+     * @param line a header line starting with the meta data indicator
+     */
+    private void parseMetaData(String line) {
+        Matcher matcher = pMeta.matcher(line);
+        if (matcher.matches()) {
+            mMetaData.add(new Pair<String, String>(matcher.group(1), matcher.group(2)));
+        }
+    }
+
+    /**
+     * split a column line into required header fields and auxillary tags
+     *
+     * @param line a header line starting with a single header indicator
+     * @throws StingException if any column name appears more than once
+     */
+    private void parseColumnNames(String line) {
+        String[] columns = line.substring(HEADER_INDICATOR.length()).trim().split("\\s+");
+        for (String column : columns) {
+            if (column.length() == 0) continue; // a bare "#" line names no columns
+            boolean added;
+            try {
+                added = mHeaderFields.add(HEADER_FIELDS.valueOf(column));
+            } catch (IllegalArgumentException e) {
+                // not one of the required columns, so it's an auxillary tag
+                added = !auxillaryTags.contains(column) && auxillaryTags.add(column);
+            }
+            if (!added) throw new StingException("Header field duplication is not allowed: " + column);
+        }
+    }
+
+    /**
+     * get the header fields in order they're presented in the input file
+     *
+     * @return the required header fields, in file order
+     */
+    public Set<HEADER_FIELDS> getHeaderFields() {
+        return mHeaderFields;
+    }
+
+    /**
+     * get the meta data, associated with this header
+     *
+     * @return the (key, value) meta data pairs, in file order
+     */
+    public List<Pair<String, String>> getMetaData() {
+        return mMetaData;
+    }
+
+    /**
+     * get the auxillary tags
+     *
+     * @return the names of the non-required columns, in file order
+     */
+    public List<String> getAuxillaryTags() {
+        return auxillaryTags;
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java
new file mode 100644
index 000000000..375e33a3e
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java
@@ -0,0 +1,119 @@
+package org.broadinstitute.sting.utils.genotype.vcf;
+
+import org.broadinstitute.sting.utils.StingException;
+
+import java.io.*;
+import java.util.List;
+import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.NoSuchElementException;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+
+/**
+ * The VCFReader class, which given a valid vcf file, parses out the header and VCF records.
+ * The underlying file is closed once the last record has been handed out; call close() to
+ * release it earlier.
+ */
+public class VCFReader implements Iterator<VCFRecord>, Closeable {
+
+    // our VCF header
+    private VCFHeader mHeader;
+
+    // our buffered input stream
+    private BufferedReader mReader;
+
+    // our next record
+    private VCFRecord mNextRecord = null;
+
+    /**
+     * Create a VCF reader, given a VCF file
+     *
+     * @param vcfFile the VCF file to read
+     * @throws StingException if the file can't be found, read, or parsed
+     */
+    public VCFReader(File vcfFile) {
+        Charset utf8 = Charset.forName("UTF-8");
+        try {
+            mReader = new BufferedReader(
+                    new InputStreamReader(
+                            new FileInputStream(vcfFile),
+                            utf8));
+        } catch (FileNotFoundException e) {
+            throw new StingException("Unable to find VCF file: " + vcfFile, e);
+        }
+
+        String line = null;
+        try {
+            List<String> lines = new ArrayList<String>();
+            line = mReader.readLine();
+            // a null line means we hit the end of the file while still in the header
+            while (line != null && line.startsWith(VCFHeader.HEADER_INDICATOR)) {
+                lines.add(line);
+                line = mReader.readLine();
+            }
+            mHeader = new VCFHeader(lines);
+            if (line != null) mNextRecord = new VCFRecord(mHeader, line);
+        } catch (IOException e) {
+            throw new StingException("Failed to parse VCF File on line: " + line, e);
+        } finally {
+            // no first record means the file has no records or the parse failed; don't leak the file handle
+            if (mNextRecord == null) closeReader();
+        }
+    }
+
+    /**
+     *
+     * @return true if we have another VCF record to return
+     */
+    public boolean hasNext() {
+        return (mNextRecord != null);
+    }
+
+    /**
+     * return the next available VCF record. Make sure to check availability with a call to hasNext!
+     *
+     * @return a VCFRecord, representing the next record in the file
+     * @throws NoSuchElementException if there are no more records
+     * @throws StingException if the following line can't be read or parsed
+     */
+    public VCFRecord next() {
+        if (mNextRecord == null) throw new NoSuchElementException("No more VCF records");
+        VCFRecord rec = mNextRecord;
+        mNextRecord = null;
+        try {
+            String line = mReader.readLine();
+            if (line != null) mNextRecord = new VCFRecord(mHeader, line);
+        } catch (IOException e) {
+            throw new StingException("Failed to read the next VCF record", e);
+        } finally {
+            // end of file (or a failed read), so we're done with the file handle
+            if (mNextRecord == null) closeReader();
+        }
+        return rec;
+    }
+
+    /**
+     * Remove is not supported
+     */
+    public void remove() {
+        throw new UnsupportedOperationException("Unsupported operation");
+    }
+
+    /**
+     * close the underlying file; after this call hasNext() returns false
+     */
+    public void close() {
+        mNextRecord = null;
+        closeReader();
+    }
+
+    // close the reader, ignoring failures: the file was only read, so there is nothing to recover
+    private void closeReader() {
+        try {
+            mReader.close();
+        } catch (IOException ignored) {
+            // intentionally ignored, see above
+        }
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java
new file mode 100644
index 000000000..4181ebfd0
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java
@@ -0,0 +1,167 @@
+package org.broadinstitute.sting.utils.genotype.vcf;
+
+import org.broadinstitute.sting.utils.StingException;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.List;
+
+/**
+ * the basic VCF record type
+ */
+public class VCFRecord {
+    // required field values
+    private final Map<VCFHeader.HEADER_FIELDS, String> mValues = new HashMap<VCFHeader.HEADER_FIELDS, String>();
+
+    // our auxillary values
+    private final Map<String, String> mAuxValues = new HashMap<String, String>();
+
+    /**
+     * create a VCFRecord, given a VCF header and the values in this field
+     *
+     * @param header the VCF header
+     * @param line   the line to parse into individual fields
+     * @throws StingException if the line doesn't have exactly one value per header column
+     */
+    public VCFRecord(VCFHeader header, String line) {
+        String tokens[] = line.split("\\s+");
+        int expected = header.getAuxillaryTags().size() + header.getHeaderFields().size();
+        if (tokens.length != expected) {
+            throw new StingException("Line:" + line + " didn't parse into " + expected + " fields");
+        }
+
+        // tokens are assigned to the required fields first, then to the auxillary tags
+        int tokenCount = 0;
+        for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
+            mValues.put(field, tokens[tokenCount]);
+            tokenCount++;
+        }
+        for (String aux : header.getAuxillaryTags()) {
+            mAuxValues.put(aux, tokens[tokenCount]);
+            tokenCount++;
+        }
+    }
+
+    /**
+     * lookup a value, given its column name
+     *
+     * @param key the column name, which is looked up in both the set columns and the auxillary columns
+     * @return a String representing the column values, or null if the field doesn't exist in this record
+     */
+    public String getValue(String key) {
+        if (key == null) return null;
+        try {
+            return mValues.get(VCFHeader.HEADER_FIELDS.valueOf(key));
+        } catch (IllegalArgumentException e) {
+            // not a required field name, so try the auxillary columns
+            return mAuxValues.get(key);
+        }
+    }
+
+    /**
+     * get a required field, given the field tag
+     *
+     * @param field the required field to look up
+     * @return the value of that field in this record
+     */
+    public String getValue(VCFHeader.HEADER_FIELDS field) {
+        return mValues.get(field);
+    }
+
+    /**
+     * @return the string for the chromosome that this VCF record is associated with
+     */
+    public String getChromosome() {
+        return this.mValues.get(VCFHeader.HEADER_FIELDS.CHROM);
+    }
+
+    /**
+     * @return this VCF records position on the specified chromosome
+     */
+    public long getPosition() {
+        return Long.parseLong(this.mValues.get(VCFHeader.HEADER_FIELDS.POS));
+    }
+
+    /**
+     * @return the ID value for this record
+     */
+    public String getID() {
+        return this.mValues.get(VCFHeader.HEADER_FIELDS.ID);
+    }
+
+    /**
+     * get the reference base
+     *
+     * @return either A, T, C, G, or N
+     */
+    public char getReferenceBase() {
+        // TODO: this field isn't validated correctly
+        return this.mValues.get(VCFHeader.HEADER_FIELDS.REF).charAt(0);
+    }
+
+    /**
+     * get the alternate allele strings
+     *
+     * @return an array of strings representing the alt alleles, or null if there are none
+     */
+    public String[] getAlternateAlleles() {
+        String alt = this.mValues.get(VCFHeader.HEADER_FIELDS.ALT);
+        if (alt.trim().equals(".")) {
+            return null;
+        }
+        return alt.split(",");
+    }
+
+    /**
+     * @return true if this record has at least one alternate allele
+     */
+    public boolean hasAlternateAllele() {
+        return getAlternateAlleles() != null;
+    }
+
+    /**
+     * @return the phred-scaled quality score
+     */
+    public int getQual() {
+        return Integer.parseInt(this.mValues.get(VCFHeader.HEADER_FIELDS.QUAL));
+    }
+
+    /**
+     * get the filter criteria
+     *
+     * @return an array of strings representing the filtering criteria, or null if none were applied
+     */
+    public String[] getFilteringCodes() {
+        String filter = this.mValues.get(VCFHeader.HEADER_FIELDS.FILTER);
+        if (filter.trim().equals("0")) {
+            return null;
+        }
+        return filter.split(";");
+    }
+
+    /**
+     * @return true if at least one filter was applied to this record
+     */
+    public boolean hasFilteringCodes() {
+        return getFilteringCodes() != null;
+    }
+
+    /**
+     * get the information key-value pairs as a Map<>
+     *
+     * @return a map of the info key-value pairs; keys that have no value (flags) map to the empty string
+     * @throws StingException if an entry has an empty key
+     */
+    public Map<String, String> getInfoValues() {
+        Map<String, String> ret = new HashMap<String, String>();
+        String info = mValues.get(VCFHeader.HEADER_FIELDS.INFO).trim();
+        if (info.equals(".")) return ret; // as with ALT, "." is treated as "no value"
+        for (String s : info.split(";")) {
+            // split on the first '=' only, so a value may itself contain '='
+            String keyValue[] = s.split("=", 2);
+            if (keyValue[0].length() == 0) throw new StingException("Key value pairs must have both a key and a value; pair: " + s);
+            ret.put(keyValue[0], keyValue.length == 2 ? keyValue[1] : "");
+        }
+        return ret;
+    }
+}
diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java
new file mode 100644
index 000000000..f55ad0198
--- /dev/null
+++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java
@@ -0,0 +1,58 @@
+package org.broadinstitute.sting.utils.genotype.vcf;
+
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.StingException;
+import org.junit.Test;
+import org.junit.Assert;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+
+
+/**
+ *
+ * @author aaron
+ *
+ * Class VCFHeaderTest
+ *
+ * Test the VCF Header class
+ */
+public class VCFHeaderTest extends BaseTest {
+
+    /**
+     * read the header lines of the example VCF file and check that they parse into a header
+     * containing every required field
+     */
+    @Test
+    public void test1() {
+        File in = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
+        if (!in.exists()) throw new StingException("vcf doesn't exist: " + in);
+        List<String> array = new ArrayList<String>();
+        BufferedReader reader = null;
+        try {
+            // read the file we just checked for, not a copy relative to the working directory
+            reader = new BufferedReader(new FileReader(in));
+            String line = reader.readLine();
+            while (line != null && line.startsWith("#")) {
+                array.add(line);
+                line = reader.readLine();
+            }
+            VCFHeader header = new VCFHeader(array);
+            Assert.assertEquals(VCFHeader.HEADER_FIELDS.values().length, header.getHeaderFields().size());
+        } catch (FileNotFoundException e) {
+            Assert.fail("File not found exception in VCFHeaderTest");
+        } catch (IOException e) {
+            Assert.fail("IO exception in VCFHeaderTest");
+        } finally {
+            if (reader != null) {
+                try {
+                    reader.close();
+                } catch (IOException ignored) {
+                    // the test is already decided; a failed close doesn't change the result
+                }
+            }
+        }
+    }
+
+}
diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java
new file mode 100644
index 000000000..44f802f3f
--- /dev/null
+++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java
@@ -0,0 +1,29 @@
+package org.broadinstitute.sting.utils.genotype.vcf;
+
+import org.junit.Test;
+import org.junit.Assert;
+import org.broadinstitute.sting.BaseTest;
+
+import java.io.File;
+
+/**
+ * test the VCFReader class test
+ */
+public class VCFReaderTest extends BaseTest {
+
+    private static final File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
+
+    /**
+     * iterate over every record in the example VCF file, and check the record count
+     */
+    @Test
+    public void testVCFInput() {
+        VCFReader reader = new VCFReader(vcfFile);
+        int counter = 0;
+        while (reader.hasNext()) {
+            counter++;
+            reader.next();
+        }
+        Assert.assertEquals(5,counter);
+    }
+}