Updated VCFReader to parse VCFs according to the VCFv3.3 spec. Column headers are now split on tabs, since sample names may contain spaces.
Updated the test files in /humgen/gsa-scr1/GATK_Data/Validation_Data/*.vcf to remove spaces, except where they are supposed to be part of the sample name. Added @Test before VCFReaderTest.testHeaderNoRecords().
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2809 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
935e76daa1
commit
fc810a1800
|
|
@ -1,14 +1,27 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
/** The VCFReader class, which given a valid vcf file, parses out the header and VCF records */
|
||||
public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||
|
||||
|
|
@ -151,7 +164,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
// iterate over all the passed in strings
|
||||
for ( String str : headerStrings ) {
|
||||
if ( !str.startsWith("##") ) {
|
||||
String[] strings = str.substring(1).split("\\s+");
|
||||
String[] strings = str.substring(1).split("\t");
|
||||
// the columns should be in order according to Richard Durbin
|
||||
int arrayIndex = 0;
|
||||
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
||||
|
|
@ -197,7 +210,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
try {
|
||||
// things we need to make a VCF record
|
||||
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||
String tokens[] = line.split("\\s+");
|
||||
String tokens[] = line.split("\t");
|
||||
|
||||
// check to ensure that the column count of tokens is right
|
||||
if (tokens.length != mHeader.getColumnCount()) {
|
||||
|
|
|
|||
|
|
@ -1,16 +1,24 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.junit.BeforeClass;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
/** test the VCFReader class test */
|
||||
public class VCFReaderTest extends BaseTest {
|
||||
|
|
@ -20,6 +28,7 @@ public class VCFReaderTest extends BaseTest {
|
|||
private static final String VCF_MIXUP_FILE = validationDataLocation + "mixedup.v2.vcf";
|
||||
private static final File complexFile = new File(validationDataLocation + "complexExample.vcf");
|
||||
private static final File headerNoRecordsFile = new File(validationDataLocation + "headerNoRecords.vcf");
|
||||
private static final File headerSampleSpaceFile = new File(validationDataLocation + "headerSampleSpaceFile.vcf");
|
||||
|
||||
|
||||
@BeforeClass
|
||||
|
|
@ -164,7 +173,7 @@ public class VCFReaderTest extends BaseTest {
|
|||
BufferedReader breader = new BufferedReader(reader);
|
||||
String line;
|
||||
while ((line = breader.readLine()) != null) {
|
||||
String[] pieces = line.split("\\s+");
|
||||
String[] pieces = line.split("\t");
|
||||
|
||||
if (line.contains("##")) {
|
||||
continue;
|
||||
|
|
@ -340,10 +349,18 @@ public class VCFReaderTest extends BaseTest {
|
|||
if (reader.hasNext()) Assert.fail("The reader should NOT have a record");
|
||||
}
|
||||
|
||||
/**
 * Reads the headerNoRecords.vcf validation file and checks that the reader
 * exposes header metadata while its iterator reports no records.
 */
@Test
|
||||
public void testHeaderNoRecords() {
|
||||
VCFReader reader = new VCFReader(headerNoRecordsFile);
|
||||
// the header metadata must be present even though the file has no records
Assert.assertTrue(reader.getHeader().getMetaData() != null);
|
||||
// there must be nothing to iterate over
Assert.assertTrue(!reader.iterator().hasNext());
|
||||
}
|
||||
|
||||
/**
 * Reads the headerSampleSpaceFile.vcf validation file and checks that the
 * header reports genotyping data with exactly one sample, named
 * "SAMPLE NAME" (a sample name that contains a space).
 */
@Test
|
||||
public void testHeaderSampleSpaceFile() {
|
||||
VCFReader reader = new VCFReader(headerSampleSpaceFile);
|
||||
Assert.assertTrue(reader.getHeader().hasGenotypingData());
|
||||
// a name containing a space must be kept as one sample, not split into two
Assert.assertTrue(reader.getHeader().getGenotypeSamples().size() == 1);
|
||||
Assert.assertTrue(reader.getHeader().getGenotypeSamples().contains("SAMPLE NAME"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue