Updated VCFReader to parse VCFs according to the VCFv3.3 spec: header columns and record fields are now split on tabs instead of arbitrary whitespace, since sample names might contain spaces.

Updated test files in /humgen/gsa-scr1/GATK_Data/Validation_Data/*.vcf to remove spaces, except where they are intentionally part of a sample name.
Added @Test before VCFReaderTest.testHeaderNoRecords()

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2809 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2010-02-08 22:55:59 +00:00
parent 935e76daa1
commit fc810a1800
2 changed files with 47 additions and 17 deletions

View File

@ -1,14 +1,27 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.Utils;
import java.io.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.broadinstitute.sting.utils.Utils;
/** The VCFReader class, which given a valid vcf file, parses out the header and VCF records */
public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
@ -151,7 +164,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
// iterate over all the passed in strings
for ( String str : headerStrings ) {
if ( !str.startsWith("##") ) {
String[] strings = str.substring(1).split("\\s+");
String[] strings = str.substring(1).split("\t");
// the columns should be in order according to Richard Durbin
int arrayIndex = 0;
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
@ -197,7 +210,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
try {
// things we need to make a VCF record
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
String tokens[] = line.split("\\s+");
String tokens[] = line.split("\t");
// check to ensure that the column count of tokens is right
if (tokens.length != mHeader.getColumnCount()) {

View File

@ -1,16 +1,24 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.junit.Assert;
import org.junit.Test;
import org.junit.BeforeClass;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.io.*;
import java.util.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
/** Tests for the VCFReader class. */
public class VCFReaderTest extends BaseTest {
@ -20,6 +28,7 @@ public class VCFReaderTest extends BaseTest {
// Path (as a String) of the mixedup.v2.vcf file under validationDataLocation.
private static final String VCF_MIXUP_FILE = validationDataLocation + "mixedup.v2.vcf";
// complexExample.vcf under validationDataLocation.
private static final File complexFile = new File(validationDataLocation + "complexExample.vcf");
// Read by testHeaderNoRecords(), which expects header metadata and no records from this file.
private static final File headerNoRecordsFile = new File(validationDataLocation + "headerNoRecords.vcf");
// Read by testHeaderSampleSpaceFile(), which expects exactly one genotype sample, named "SAMPLE NAME" (with a space).
private static final File headerSampleSpaceFile = new File(validationDataLocation + "headerSampleSpaceFile.vcf");
@BeforeClass
@ -164,7 +173,7 @@ public class VCFReaderTest extends BaseTest {
BufferedReader breader = new BufferedReader(reader);
String line;
while ((line = breader.readLine()) != null) {
String[] pieces = line.split("\\s+");
String[] pieces = line.split("\t");
if (line.contains("##")) {
continue;
@ -340,10 +349,18 @@ public class VCFReaderTest extends BaseTest {
if (reader.hasNext()) Assert.fail("The reader should NOT have a record");
}
/**
 * Reads the header-only VCF fixture and checks that the header metadata is
 * present while the record iterator yields nothing.
 */
@Test
public void testHeaderNoRecords() {
    final VCFReader headerOnlyReader = new VCFReader(headerNoRecordsFile);

    // the header itself must still be parsed even though no records follow it
    Assert.assertNotNull(headerOnlyReader.getHeader().getMetaData());

    // no records in the file, so iteration must be empty from the start
    Assert.assertFalse(headerOnlyReader.iterator().hasNext());
}
/**
 * Reads a VCF fixture whose single sample column is named "SAMPLE NAME" and
 * checks that the header keeps the name intact, i.e. that the embedded space
 * is not treated as a column separator.
 */
@Test
public void testHeaderSampleSpaceFile() {
    VCFReader reader = new VCFReader(headerSampleSpaceFile);

    Assert.assertTrue("header should report genotyping data",
            reader.getHeader().hasGenotypingData());

    // assertEquals reports the actual sample count on failure, which a bare
    // assertTrue(size == 1) would hide
    Assert.assertEquals("unexpected number of genotype samples",
            1, reader.getHeader().getGenotypeSamples().size());

    Assert.assertTrue("sample name containing a space should be preserved, found: "
                    + reader.getHeader().getGenotypeSamples(),
            reader.getHeader().getGenotypeSamples().contains("SAMPLE NAME"));
}
}