Updated VCFReader to parse VCFs according to the VCFv3.3 spec. Column headers are now split on tabs, since sample names may contain spaces.
Updated test files in /humgen/gsa-scr1/GATK_Data/Validation_Data/*.vcf to remove spaces except where they are supposed to be in the sample name. Added @Test before VCFReaderTest.testHeaderNoRecords().
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2809 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
935e76daa1
commit
fc810a1800
|
|
@ -1,14 +1,27 @@
|
||||||
package org.broadinstitute.sting.utils.genotype.vcf;
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
import java.io.*;
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedHashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeSet;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
/** The VCFReader class, which given a valid vcf file, parses out the header and VCF records */
|
/** The VCFReader class, which given a valid vcf file, parses out the header and VCF records */
|
||||||
public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
|
|
||||||
|
|
@ -151,7 +164,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
// iterate over all the passed in strings
|
// iterate over all the passed in strings
|
||||||
for ( String str : headerStrings ) {
|
for ( String str : headerStrings ) {
|
||||||
if ( !str.startsWith("##") ) {
|
if ( !str.startsWith("##") ) {
|
||||||
String[] strings = str.substring(1).split("\\s+");
|
String[] strings = str.substring(1).split("\t");
|
||||||
// the columns should be in order according to Richard Durbin
|
// the columns should be in order according to Richard Durbin
|
||||||
int arrayIndex = 0;
|
int arrayIndex = 0;
|
||||||
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
||||||
|
|
@ -197,7 +210,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
try {
|
try {
|
||||||
// things we need to make a VCF record
|
// things we need to make a VCF record
|
||||||
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||||
String tokens[] = line.split("\\s+");
|
String tokens[] = line.split("\t");
|
||||||
|
|
||||||
// check to ensure that the column count of tokens is right
|
// check to ensure that the column count of tokens is right
|
||||||
if (tokens.length != mHeader.getColumnCount()) {
|
if (tokens.length != mHeader.getColumnCount()) {
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,24 @@
|
||||||
package org.broadinstitute.sting.utils.genotype.vcf;
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import java.io.BufferedReader;
|
||||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
import java.io.File;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import java.io.FileNotFoundException;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import java.io.FileReader;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
import java.io.IOException;
|
||||||
import org.junit.Assert;
|
import java.util.ArrayList;
|
||||||
import org.junit.Test;
|
import java.util.Iterator;
|
||||||
import org.junit.BeforeClass;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import java.io.*;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import java.util.*;
|
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
/** test the VCFReader class test */
|
/** test the VCFReader class test */
|
||||||
public class VCFReaderTest extends BaseTest {
|
public class VCFReaderTest extends BaseTest {
|
||||||
|
|
@ -20,6 +28,7 @@ public class VCFReaderTest extends BaseTest {
|
||||||
private static final String VCF_MIXUP_FILE = validationDataLocation + "mixedup.v2.vcf";
|
private static final String VCF_MIXUP_FILE = validationDataLocation + "mixedup.v2.vcf";
|
||||||
private static final File complexFile = new File(validationDataLocation + "complexExample.vcf");
|
private static final File complexFile = new File(validationDataLocation + "complexExample.vcf");
|
||||||
private static final File headerNoRecordsFile = new File(validationDataLocation + "headerNoRecords.vcf");
|
private static final File headerNoRecordsFile = new File(validationDataLocation + "headerNoRecords.vcf");
|
||||||
|
private static final File headerSampleSpaceFile = new File(validationDataLocation + "headerSampleSpaceFile.vcf");
|
||||||
|
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
|
|
@ -164,7 +173,7 @@ public class VCFReaderTest extends BaseTest {
|
||||||
BufferedReader breader = new BufferedReader(reader);
|
BufferedReader breader = new BufferedReader(reader);
|
||||||
String line;
|
String line;
|
||||||
while ((line = breader.readLine()) != null) {
|
while ((line = breader.readLine()) != null) {
|
||||||
String[] pieces = line.split("\\s+");
|
String[] pieces = line.split("\t");
|
||||||
|
|
||||||
if (line.contains("##")) {
|
if (line.contains("##")) {
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -340,10 +349,18 @@ public class VCFReaderTest extends BaseTest {
|
||||||
if (reader.hasNext()) Assert.fail("The reader should NOT have a record");
|
if (reader.hasNext()) Assert.fail("The reader should NOT have a record");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
public void testHeaderNoRecords() {
|
public void testHeaderNoRecords() {
|
||||||
VCFReader reader = new VCFReader(headerNoRecordsFile);
|
VCFReader reader = new VCFReader(headerNoRecordsFile);
|
||||||
Assert.assertTrue(reader.getHeader().getMetaData() != null);
|
Assert.assertTrue(reader.getHeader().getMetaData() != null);
|
||||||
Assert.assertTrue(!reader.iterator().hasNext());
|
Assert.assertTrue(!reader.iterator().hasNext());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHeaderSampleSpaceFile() {
|
||||||
|
VCFReader reader = new VCFReader(headerSampleSpaceFile);
|
||||||
|
Assert.assertTrue(reader.getHeader().hasGenotypingData());
|
||||||
|
Assert.assertTrue(reader.getHeader().getGenotypeSamples().size() == 1);
|
||||||
|
Assert.assertTrue(reader.getHeader().getGenotypeSamples().contains("SAMPLE NAME"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue