Updated VCFReader to parse VCF files according to the VCFv3.3 spec. The column header line and the record lines are now split on tabs instead of on any whitespace, since sample names might contain spaces.

Updated the test files in /humgen/gsa-scr1/GATK_Data/Validation_Data/*.vcf to remove spaces, except where they are part of a sample name.
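Added the missing @Test annotation to VCFReaderTest.testHeaderNoRecords() so that it is run as a test, and added VCFReaderTest.testHeaderSampleSpaceFile() to cover a sample name that contains a space.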

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2809 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2010-02-08 22:55:59 +00:00
parent 935e76daa1
commit fc810a1800
2 changed files with 47 additions and 17 deletions

View File

@ -1,14 +1,27 @@
package org.broadinstitute.sting.utils.genotype.vcf; package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.Utils; import java.io.BufferedReader;
import java.io.File;
import java.io.*; import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.*; import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import org.broadinstitute.sting.utils.Utils;
/** The VCFReader class, which given a valid vcf file, parses out the header and VCF records */ /** The VCFReader class, which given a valid vcf file, parses out the header and VCF records */
public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> { public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
@ -151,7 +164,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
// iterate over all the passed in strings // iterate over all the passed in strings
for ( String str : headerStrings ) { for ( String str : headerStrings ) {
if ( !str.startsWith("##") ) { if ( !str.startsWith("##") ) {
String[] strings = str.substring(1).split("\\s+"); String[] strings = str.substring(1).split("\t");
// the columns should be in order according to Richard Durbin // the columns should be in order according to Richard Durbin
int arrayIndex = 0; int arrayIndex = 0;
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
@ -197,7 +210,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
try { try {
// things we need to make a VCF record // things we need to make a VCF record
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>(); Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
String tokens[] = line.split("\\s+"); String tokens[] = line.split("\t");
// check to ensure that the column count of tokens is right // check to ensure that the column count of tokens is right
if (tokens.length != mHeader.getColumnCount()) { if (tokens.length != mHeader.getColumnCount()) {

View File

@ -1,16 +1,24 @@
package org.broadinstitute.sting.utils.genotype.vcf; package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest; import java.io.BufferedReader;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File;
import org.broadinstitute.sting.utils.StingException; import java.io.FileNotFoundException;
import org.broadinstitute.sting.utils.GenomeLocParser; import java.io.FileReader;
import org.broadinstitute.sting.gatk.refdata.RodVCF; import java.io.IOException;
import org.junit.Assert; import java.util.ArrayList;
import org.junit.Test; import java.util.Iterator;
import org.junit.BeforeClass; import java.util.List;
import java.util.Map;
import java.util.Set;
import java.io.*; import org.broadinstitute.sting.BaseTest;
import java.util.*; import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
/** test the VCFReader class test */ /** test the VCFReader class test */
public class VCFReaderTest extends BaseTest { public class VCFReaderTest extends BaseTest {
@ -20,6 +28,7 @@ public class VCFReaderTest extends BaseTest {
private static final String VCF_MIXUP_FILE = validationDataLocation + "mixedup.v2.vcf"; private static final String VCF_MIXUP_FILE = validationDataLocation + "mixedup.v2.vcf";
private static final File complexFile = new File(validationDataLocation + "complexExample.vcf"); private static final File complexFile = new File(validationDataLocation + "complexExample.vcf");
private static final File headerNoRecordsFile = new File(validationDataLocation + "headerNoRecords.vcf"); private static final File headerNoRecordsFile = new File(validationDataLocation + "headerNoRecords.vcf");
private static final File headerSampleSpaceFile = new File(validationDataLocation + "headerSampleSpaceFile.vcf");
@BeforeClass @BeforeClass
@ -164,7 +173,7 @@ public class VCFReaderTest extends BaseTest {
BufferedReader breader = new BufferedReader(reader); BufferedReader breader = new BufferedReader(reader);
String line; String line;
while ((line = breader.readLine()) != null) { while ((line = breader.readLine()) != null) {
String[] pieces = line.split("\\s+"); String[] pieces = line.split("\t");
if (line.contains("##")) { if (line.contains("##")) {
continue; continue;
@ -340,10 +349,18 @@ public class VCFReaderTest extends BaseTest {
if (reader.hasNext()) Assert.fail("The reader should NOT have a record"); if (reader.hasNext()) Assert.fail("The reader should NOT have a record");
} }
@Test
public void testHeaderNoRecords() { public void testHeaderNoRecords() {
VCFReader reader = new VCFReader(headerNoRecordsFile); VCFReader reader = new VCFReader(headerNoRecordsFile);
Assert.assertTrue(reader.getHeader().getMetaData() != null); Assert.assertTrue(reader.getHeader().getMetaData() != null);
Assert.assertTrue(!reader.iterator().hasNext()); Assert.assertTrue(!reader.iterator().hasNext());
}
@Test
public void testHeaderSampleSpaceFile() {
VCFReader reader = new VCFReader(headerSampleSpaceFile);
Assert.assertTrue(reader.getHeader().hasGenotypingData());
Assert.assertTrue(reader.getHeader().getGenotypeSamples().size() == 1);
Assert.assertTrue(reader.getHeader().getGenotypeSamples().contains("SAMPLE NAME"));
} }
} }