diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java index 351076d05..777ec5b19 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java @@ -1,15 +1,8 @@ package org.broadinstitute.sting.utils.genotype.vcf; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.StingException; import org.apache.log4j.Logger; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; -import java.util.LinkedHashSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.*; /** @@ -30,7 +23,7 @@ public class VCFHeader { private Set mHeaderFields = new LinkedHashSet(); // the associated meta data - private final List> mMetaData = new ArrayList>(); + private final Map mMetaData = new HashMap(); // the list of auxillary tags private final List auxillaryTags = new ArrayList(); @@ -41,62 +34,25 @@ public class VCFHeader { // the header string indicator public static final String HEADER_INDICATOR = "#"; - /** - * our log, which we want to capture anything from this class - */ + /** our log, which we use to capture anything from this class */ private static Logger logger = Logger.getLogger(VCFHeader.class); - // patterns we use for detecting meta data and header lines - private static Pattern pMeta = Pattern.compile("^" + METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$"); - - /** - * create a VCF header, given an array of strings that all start with at least the # character + * create a VCF header, given a list of meta data and auxillary tags * - * @param headerStrings a list of header strings + * @param metaData + * @param additionalColumns */ - public VCFHeader(List headerStrings) { - try { - Thread.sleep(5000); - } catch (InterruptedException e) { - e.printStackTrace(); //To change body of catch statement use File | 
Settings | File Templates. - } - - // iterate over all the passed in strings - for (String str : headerStrings) { - Matcher matcher = pMeta.matcher(str); - if (matcher.matches()) { - String metaKey = ""; - String metaValue = ""; - if (matcher.groupCount() < 1) continue; - if (matcher.groupCount() == 2) metaValue = matcher.group(2); - metaKey = matcher.group(1); - mMetaData.add(new Pair(metaKey, metaValue)); - } - } - - // iterate over all the passed in strings - for (String str : headerStrings) { - if (str.startsWith("#") && !str.startsWith("##")) { - String[] strings = str.substring(1).split("\\s+"); - for (String s : strings) { - if (mHeaderFields.contains(s)) throw new StingException("Header field duplication is not allowed"); - try { - mHeaderFields.add(HEADER_FIELDS.valueOf(s)); - } catch (IllegalArgumentException e) { - this.auxillaryTags.add(s); - } - } - } - } - if (mHeaderFields.size() != HEADER_FIELDS.values().length) { - throw new StingException("The VCF header is missing " + (HEADER_FIELDS.values().length - mHeaderFields.size()) + " required fields"); - } + public VCFHeader(Set headerFields, Map metaData, List additionalColumns) { + for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field); + for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key)); + for (String col : additionalColumns) auxillaryTags.add(col); } /** - * get the header fieldsm in order they're presented in the input file - * @return + * get the header fields in order they're presented in the input file + * + * @return a set of the header fields, in order */ public Set getHeaderFields() { return mHeaderFields; @@ -104,15 +60,17 @@ public class VCFHeader { /** * get the meta data, associated with this header - * @return + * + * @return a map of the meta data */ - public List> getMetaData() { + public Map getMetaData() { return mMetaData; } /** * get the auxillary tags - * @return + * + * @return a list of the extra column names, in order */ public List 
getAuxillaryTags() { return auxillaryTags; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java index 375e33a3e..5ca28ca3e 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java @@ -1,18 +1,16 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; import java.io.*; -import java.util.List; -import java.util.Iterator; -import java.util.ArrayList; -import java.nio.ByteBuffer; import java.nio.charset.Charset; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -/** - * The VCFReader class, which given a valid vcf file, parses out the header and VCF records - */ -public class VCFReader implements Iterator { +/** The VCFReader class, which given a valid vcf file, parses out the header and VCF records */ +public class VCFReader implements Iterator, Iterable { // our VCF header private VCFHeader mHeader; @@ -22,11 +20,14 @@ public class VCFReader implements Iterator { // our next record private VCFRecord mNextRecord = null; + + // a pattern we use for detecting meta data and header lines + private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$"); /** * Create a VCF reader, given a VCF file * - * @param vcfFile + * @param vcfFile the vcf file to read */ public VCFReader(File vcfFile) { Charset utf8 = Charset.forName("UTF-8"); @@ -47,7 +48,7 @@ public class VCFReader implements Iterator { lines.add(line); line = mReader.readLine(); } - mHeader = new VCFHeader(lines); + mHeader = this.createHeader(lines); mNextRecord = new VCFRecord(mHeader, line); } catch (IOException e) { throw new StingException("Failed to parse VCF File on line: " + line, e); @@ -55,16 +56,14 @@ public class 
VCFReader implements Iterator { } - /** - * - * @return true if we have another VCF record to return - */ + /** @return true if we have another VCF record to return */ public boolean hasNext() { return (mNextRecord != null); } /** * return the next available VCF record. Make sure to check availability with a call to hasNext! + * * @return a VCFRecord, representing the next record in the file */ public VCFRecord next() { @@ -79,10 +78,74 @@ public class VCFReader implements Iterator { return rec; } - /** - * Remove is not supported - */ + /** Remove is not supported */ public void remove() { throw new UnsupportedOperationException("Unsupported operation"); } + + /** + * create a VCF header, given an array of strings that all start with at least the # character. This function is + * package protected so that the VCFReader can access this function + * + * @param headerStrings a list of header strings + */ + protected VCFHeader createHeader(List headerStrings) { + + Map metaData = new HashMap(); + Set headerFields = new LinkedHashSet(); + List auxTags = new ArrayList(); + // iterate over all the passed in strings + for (String str : headerStrings) { + Matcher matcher = pMeta.matcher(str); + if (matcher.matches()) { + String metaKey = ""; + String metaValue = ""; + if (matcher.groupCount() < 1) continue; + if (matcher.groupCount() == 2) metaValue = matcher.group(2); + metaKey = matcher.group(1); + metaData.put(metaKey, metaValue); + } + } + + // iterate over all the passed in strings + for (String str : headerStrings) { + if (str.startsWith("#") && !str.startsWith("##")) { + String[] strings = str.substring(1).split("\\s+"); + for (String s : strings) { + if (headerFields.contains(s)) throw new StingException("Header field duplication is not allowed"); + try { + headerFields.add(VCFHeader.HEADER_FIELDS.valueOf(s)); + } catch (IllegalArgumentException e) { + auxTags.add(s); + } + } + } + } + if (headerFields.size() != VCFHeader.HEADER_FIELDS.values().length) { + throw 
new StingException("The VCF header is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size()) + " required fields"); + } + return new VCFHeader(headerFields,metaData,auxTags); + } + + /** + * + * @return get the header associated with this reader + */ + public VCFHeader getHeader() { + return this.mHeader; + } + + @Override + public Iterator iterator() { + return this; + } + + public void close() { + try { + mReader.close(); + } catch (IOException e) { + // we don't really care + Utils.warnUser("Unable to close VCF reader file, this is not fatal, but is worth noting"); + } + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 4181ebfd0..20fcd5bf3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -3,8 +3,8 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.utils.StingException; import java.util.HashMap; -import java.util.Map; import java.util.List; +import java.util.Map; /** * the basic VCF record type @@ -17,12 +17,14 @@ public class VCFRecord { private Map mAuxValues = new HashMap(); /** - * create a VCFRecord, given a VCF header and the the values in this field + * create a VCFRecord, given a VCF header and the the values in this field. 
This is protected, so that the reader is + * the only accessing object + * TODO: this seems like a bad design * * @param header the VCF header * @param line the line to parse into individual fields */ - public VCFRecord(VCFHeader header, String line) { + protected VCFRecord(VCFHeader header, String line) { String tokens[] = line.split("\\s+"); if (tokens.length != (header.getAuxillaryTags().size() + header.getHeaderFields().size())) { throw new StingException("Line:" + line + " didn't parse into " + (header.getAuxillaryTags().size() + header.getHeaderFields().size()) + " fields"); @@ -39,6 +41,22 @@ public class VCFRecord { } } + public VCFRecord(VCFHeader header, List values) { + if (values.size() != (header.getAuxillaryTags().size() + header.getHeaderFields().size())) { + throw new StingException("The input list doesn't contain enough fields, it should have " + (header.getAuxillaryTags().size() + header.getHeaderFields().size()) + " fields"); + } + int index = 0; + for (VCFHeader.HEADER_FIELDS field: header.getHeaderFields()) { + mValues.put(field,values.get(index)); + index++; + } + for (String str: header.getAuxillaryTags()) { + mAuxValues.put(str,values.get(index)); + index++; + } + } + + /** * lookup a value, given it's column name * @@ -150,4 +168,12 @@ public class VCFRecord { } return ret; } + + /** + * + * @return the number of columns of data we're storing + */ + public int getColumnCount() { + return this.mAuxValues.size() + this.mValues.size(); + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java new file mode 100644 index 000000000..c61444f64 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -0,0 +1,91 @@ +package org.broadinstitute.sting.utils.genotype.vcf; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.nio.charset.Charset; + +/** this class writes VCF files */ 
+public class VCFWriter { + + // the VCF header we're storing + private VCFHeader mHeader; + + // the writer we're writing to + BufferedWriter mWriter; + + /** + * create a VCF writer, given a VCF header and a file to write to + * + * @param header the VCF header + * @param location the file location to write to + */ + public VCFWriter(VCFHeader header, File location) { + this.mHeader = header; + Charset utf8 = Charset.forName("UTF-8"); + try { + mWriter = new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(location), + utf8)); + } catch (FileNotFoundException e) { + throw new StingException("Unable to create VCF file: " + location, e); + } + try { + + // write the header meta-data out + for (String metadata : header.getMetaData().keySet()) { + mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "=" + header.getMetaData().get(metadata) + "\n"); + } + // write out the column line + StringBuilder b = new StringBuilder(); + b.append(VCFHeader.HEADER_INDICATOR); + for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + "\t"); + for (String field : header.getAuxillaryTags()) b.append(field + "\t"); + mWriter.write(b.toString() + "\n"); + } + catch (IOException e) { + throw new StingException("IOException writing the VCF header", e); + } + } + + /** + * output a record to the VCF file + * @param record the record to output + */ + public void addRecord(VCFRecord record) { + if (record.getColumnCount() != mHeader.getAuxillaryTags().size() + mHeader.getHeaderFields().size()) { + throw new StingException("Record has " + record.getColumnCount() + + " columns, when is should have " + (mHeader.getAuxillaryTags().size() + + mHeader.getHeaderFields().size())); + } + StringBuilder builder = new StringBuilder(); + // first output the required fields in order + boolean first = true; + for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields()) { + if (first) { first = false; builder.append(record.getValue(field)); } + else 
builder.append("\t" + record.getValue(field)); + } + for (String auxTag : mHeader.getAuxillaryTags()) { + builder.append("\t" + record.getValue(auxTag)); + } + try { + mWriter.write(builder.toString() + "\n"); + } catch (IOException e) { + throw new StingException("Unable to write the VCF object to a file"); + } + } + + /** + * attempt to close the VCF file + */ + public void close() { + try { + mWriter.close(); + } catch (IOException e) { + throw new StingException("Unable to close VCFFile"); + } + } + + +} diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java index f55ad0198..83a3231c8 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java @@ -1,13 +1,10 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.StingException; -import org.junit.Test; import org.junit.Assert; +import org.junit.Test; -import java.io.*; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** @@ -20,24 +17,45 @@ import java.util.List; */ public class VCFHeaderTest extends BaseTest { + private Set headerFields = new LinkedHashSet(); + private Map metaData = new HashMap(); + private List additionalColumns = new ArrayList(); + + /** + * give it fake data, and make sure we get back the right fake data + */ @Test - public void test1() { - File in = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf"); - if (!in.exists()) throw new StingException("vfc doesn't exist"); - List array = new ArrayList(); - try { - BufferedReader reader = new BufferedReader(new FileReader("vcfexample.vcf")); - String line = reader.readLine(); - while (line.startsWith("#")) { - array.add(line); - line = reader.readLine(); - } - VCFHeader header = new VCFHeader(array); - } catch 
(FileNotFoundException e) { - Assert.fail("File not found exception in VCFHeaderTest"); - } catch (IOException e) { - Assert.fail("IO exception in VCFHeaderTest"); + public void testHeaderConstructor() { + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + headerFields.add(field); } + metaData.put("one","1"); + metaData.put("two","2"); + additionalColumns.add("extra1"); + additionalColumns.add("extra2"); + // this should create a header that is valid + + VCFHeader header = new VCFHeader(headerFields, metaData, additionalColumns); + + // check the fields + int index = 0; + for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) { + Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index],field); + index++; + } + index = 0; + for (String key: header.getMetaData().keySet()) { + Assert.assertEquals(header.getMetaData().get(key),metaData.get(key)); + index++; + } + Assert.assertEquals(metaData.size(),index); + index = 0; + for (String key: header.getAuxillaryTags()) { + Assert.assertTrue(additionalColumns.contains(key)); + index++; + } + Assert.assertEquals(additionalColumns.size(),index); } + } diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java new file mode 100644 index 000000000..123b632c7 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java @@ -0,0 +1,96 @@ +package org.broadinstitute.sting.utils.genotype.vcf; + +import org.broadinstitute.sting.BaseTest; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.util.*; + + +/** + * @author aaron + *

+ * Class VCFWriterTest + *

+ * This class tests out the ability of the VCF writer to correctly write VCF files + */ +public class VCFWriterTest extends BaseTest { + private Set headerFields = new LinkedHashSet(); + private Map metaData = new HashMap(); + private List additionalColumns = new ArrayList(); + private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); + + /** test, using the writer and reader, that we can output and input a VCF file without problems */ + @Test + public void testBasicWriteAndRead() { + VCFHeader header = createFakeHeader(); + VCFWriter writer = new VCFWriter(header,fakeVCFFile); + writer.addRecord(createVCFRecord(header)); + writer.addRecord(createVCFRecord(header)); + writer.close(); + VCFReader reader = new VCFReader(fakeVCFFile); + int counter = 0; + // validate what we're reading in + validateHeader(reader.getHeader()); + for(VCFRecord rec :reader) { + counter++; + } + Assert.assertEquals(2,counter); + reader.close(); + fakeVCFFile.delete(); + + } + + /** + * create a fake header of known quantity + * @return a fake VCF header + */ + private VCFHeader createFakeHeader() { + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + headerFields.add(field); + } + metaData.put("one", "1"); + metaData.put("two", "2"); + additionalColumns.add("extra1"); + additionalColumns.add("extra2"); + // this should create a header that is valid + + return new VCFHeader(headerFields, metaData, additionalColumns); + } + + private VCFRecord createVCFRecord(VCFHeader header) { + int totalVals = header.getHeaderFields().size() + header.getAuxillaryTags().size(); + List array = new ArrayList(); + for (int x = 0; x < totalVals; x++) { + array.add(String.valueOf(x)); + } + return new VCFRecord(header,array); + } + + + /** + * validate a VCF header + * @param header the header to validate + */ + public void validateHeader(VCFHeader header) { + // check the fields + int index = 0; + for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) { + 
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index], field); + index++; + } + index = 0; + for (String key : header.getMetaData().keySet()) { + Assert.assertEquals(header.getMetaData().get(key), metaData.get(key)); + index++; + } + Assert.assertEquals(metaData.size(), index); + index = 0; + for (String key : header.getAuxillaryTags()) { + Assert.assertTrue(additionalColumns.contains(key)); + index++; + } + Assert.assertEquals(additionalColumns.size(), index); + } +}