VCF changes from Richard's comments; fixed a test case
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1456 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ee05ddde16
commit
811503d67b
|
|
@ -19,9 +19,6 @@ public class VCFHeader {
|
||||||
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
|
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
|
||||||
}
|
}
|
||||||
|
|
||||||
// our header field ordering, as a linked hash set to guarantee ordering
|
|
||||||
private Set<HEADER_FIELDS> mHeaderFields = new LinkedHashSet<HEADER_FIELDS>();
|
|
||||||
|
|
||||||
// the associated meta data
|
// the associated meta data
|
||||||
private final Map<String, String> mMetaData = new HashMap<String, String>();
|
private final Map<String, String> mMetaData = new HashMap<String, String>();
|
||||||
|
|
||||||
|
|
@ -46,11 +43,9 @@ public class VCFHeader {
|
||||||
/**
|
/**
|
||||||
* create a VCF header, given a list of meta data and auxiliary tags
|
* create a VCF header, given a list of meta data and auxiliary tags
|
||||||
*
|
*
|
||||||
* @param headerFields the required header fields, in order they're presented
|
|
||||||
* @param metaData the meta data associated with this header
|
* @param metaData the meta data associated with this header
|
||||||
*/
|
*/
|
||||||
protected VCFHeader(Set<HEADER_FIELDS> headerFields, Map<String, String> metaData) {
|
protected VCFHeader(Map<String, String> metaData) {
|
||||||
for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field);
|
|
||||||
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
||||||
checkVCFVersion();
|
checkVCFVersion();
|
||||||
}
|
}
|
||||||
|
|
@ -58,18 +53,16 @@ public class VCFHeader {
|
||||||
/**
|
/**
|
||||||
* create a VCF header, given a list of meta data and auxiliary tags
|
* create a VCF header, given a list of meta data and auxiliary tags
|
||||||
*
|
*
|
||||||
* @param headerFields the required header fields, in order they're presented
|
|
||||||
* @param metaData the meta data associated with this header
|
* @param metaData the meta data associated with this header
|
||||||
* @param genotypeSampleNames the genotype format field, and the sample names
|
* @param genotypeSampleNames the genotype format field, and the sample names
|
||||||
*/
|
*/
|
||||||
protected VCFHeader(Set<HEADER_FIELDS> headerFields, Map<String, String> metaData, List<String> genotypeSampleNames) {
|
protected VCFHeader(Map<String, String> metaData, List<String> genotypeSampleNames) {
|
||||||
for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field);
|
|
||||||
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
||||||
for (String col : genotypeSampleNames) {
|
for (String col : genotypeSampleNames) {
|
||||||
if (!col.equals("FORMAT"))
|
if (!col.equals("FORMAT"))
|
||||||
mGenotypeSampleNames.add(col);
|
mGenotypeSampleNames.add(col);
|
||||||
}
|
}
|
||||||
hasGenotypingData = true;
|
if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
|
||||||
checkVCFVersion();
|
checkVCFVersion();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -88,12 +81,16 @@ public class VCFHeader {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the header fields in order they're presented in the input file
|
* get the header fields in order they're presented in the input file (which is now required to be
|
||||||
|
* the order presented in the spec).
|
||||||
*
|
*
|
||||||
* @return a set of the header fields, in order
|
* @return a set of the header fields, in order
|
||||||
*/
|
*/
|
||||||
public Set<HEADER_FIELDS> getHeaderFields() {
|
public Set<HEADER_FIELDS> getHeaderFields() {
|
||||||
return mHeaderFields;
|
Set<HEADER_FIELDS> fields = new LinkedHashSet<HEADER_FIELDS>();
|
||||||
|
for (HEADER_FIELDS field : HEADER_FIELDS.values())
|
||||||
|
fields.add(field);
|
||||||
|
return fields;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -125,7 +122,7 @@ public class VCFHeader {
|
||||||
|
|
||||||
/** @return the column count, */
|
/** @return the column count, */
|
||||||
public int getColumnCount() {
|
public int getColumnCount() {
|
||||||
return mHeaderFields.size() + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
lines.add(line);
|
lines.add(line);
|
||||||
line = mReader.readLine();
|
line = mReader.readLine();
|
||||||
}
|
}
|
||||||
mHeader = this.createHeader(lines);
|
mHeader = this.createHeader(lines);
|
||||||
mNextRecord = createRecord(line, mHeader);
|
mNextRecord = createRecord(line, mHeader);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new RuntimeException("VCFReader: Failed to parse VCF File on line: " + line, e);
|
throw new RuntimeException("VCFReader: Failed to parse VCF File on line: " + line, e);
|
||||||
|
|
@ -126,7 +126,6 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
protected VCFHeader createHeader(List<String> headerStrings) {
|
protected VCFHeader createHeader(List<String> headerStrings) {
|
||||||
|
|
||||||
Map<String, String> metaData = new HashMap<String, String>();
|
Map<String, String> metaData = new HashMap<String, String>();
|
||||||
Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
|
|
||||||
List<String> auxTags = new ArrayList<String>();
|
List<String> auxTags = new ArrayList<String>();
|
||||||
// iterate over all the passed in strings
|
// iterate over all the passed in strings
|
||||||
for (String str : headerStrings) {
|
for (String str : headerStrings) {
|
||||||
|
|
@ -142,32 +141,28 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// iterate over all the passed in strings
|
// iterate over all the passed in strings
|
||||||
for (String str : headerStrings) {
|
for (String str : headerStrings) { // TODO: fix, we shouldn't loop over every line
|
||||||
if (str.startsWith("#") && !str.startsWith("##")) {
|
if (str.startsWith("#") && !str.startsWith("##")) {
|
||||||
String[] strings = str.substring(1).split("\\s+");
|
String[] strings = str.substring(1).split("\\s+");
|
||||||
for (String s : strings) {
|
// the columns should be in order according to Richard Durbin
|
||||||
VCFHeader.HEADER_FIELDS field;
|
int arrayIndex = 0;
|
||||||
|
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
||||||
try {
|
try {
|
||||||
field = VCFHeader.HEADER_FIELDS.valueOf(s);
|
if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex]))
|
||||||
|
throw new RuntimeException("VCFReader: we were expecting column name " + field + " but we saw " + strings[arrayIndex]);
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
throw new RuntimeException("VCFReader: Unknown column name \"" + s + "\", it does not match a known column header name.");
|
throw new RuntimeException("VCFReader: Unknown column name \"" + strings[arrayIndex] + "\", it does not match a known column header name.");
|
||||||
}
|
|
||||||
if (headerFields.contains(field))
|
|
||||||
throw new RuntimeException("VCFReader: Header field duplication is not allowed");
|
|
||||||
try {
|
|
||||||
headerFields.add(VCFHeader.HEADER_FIELDS.valueOf(s));
|
|
||||||
} catch (IllegalArgumentException e) {
|
|
||||||
if (!s.equals("FORMAT"))
|
|
||||||
auxTags.add(s);
|
|
||||||
}
|
}
|
||||||
|
arrayIndex++;
|
||||||
|
}
|
||||||
|
while (arrayIndex < strings.length) {
|
||||||
|
if (!strings[arrayIndex].equals("FORMAT"))
|
||||||
|
auxTags.add(strings[arrayIndex]);
|
||||||
|
arrayIndex++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (headerFields.size() != VCFHeader.HEADER_FIELDS.values().length) {
|
return new VCFHeader(metaData, auxTags);
|
||||||
throw new RuntimeException("VCFReader: The VCF column header line is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size())
|
|
||||||
+ " of the " + VCFHeader.HEADER_FIELDS.values().length + " required fields");
|
|
||||||
}
|
|
||||||
return new VCFHeader(headerFields, metaData, auxTags);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -221,7 +216,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
Map<String, String> tagToValue = new HashMap<String, String>();
|
Map<String, String> tagToValue = new HashMap<String, String>();
|
||||||
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
||||||
List<String> bases = new ArrayList<String>();
|
List<String> bases = new ArrayList<String>();
|
||||||
|
int addedCount = 0;
|
||||||
String keyStrings[] = formatString.split(":");
|
String keyStrings[] = formatString.split(":");
|
||||||
for (String key : keyStrings) {
|
for (String key : keyStrings) {
|
||||||
String parse;
|
String parse;
|
||||||
|
|
@ -236,17 +231,23 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
if (key.equals("GT")) {
|
if (key.equals("GT")) {
|
||||||
Matcher m = gtPattern.matcher(parse);
|
Matcher m = gtPattern.matcher(parse);
|
||||||
if (!m.matches())
|
if (!m.matches())
|
||||||
throw new RuntimeException("Ubable to match GT genotype flag to it's regular expression");
|
throw new RuntimeException("VCFReader: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse);
|
||||||
phase = VCFGenotypeRecord.determinePhase(m.group(2));
|
phase = VCFGenotypeRecord.determinePhase(m.group(2));
|
||||||
addAllele(m.group(1), altAlleles, referenceBase, bases);
|
addAllele(m.group(1), altAlleles, referenceBase, bases);
|
||||||
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
|
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
|
||||||
}
|
}
|
||||||
tagToValue.put(key, parse);
|
tagToValue.put(key, parse);
|
||||||
|
addedCount++;
|
||||||
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
|
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
|
||||||
genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length());
|
genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length());
|
||||||
}
|
}
|
||||||
if (keyStrings.length != tagToValue.size() || genotypeString.length() > 0)
|
// catch some common errors, either there are too many field keys or there are too many field values
|
||||||
throw new RuntimeException("genotype value count doesn't match the key count");
|
if (keyStrings.length != tagToValue.size())
|
||||||
|
throw new RuntimeException("VCFReader: genotype value count doesn't match the key count (expected "
|
||||||
|
+ keyStrings.length + " but saw " + tagToValue.size() + ")");
|
||||||
|
else if (genotypeString.length() > 0)
|
||||||
|
throw new RuntimeException("VCFReader: genotype string contained additional unprocessed fields: " + genotypeString
|
||||||
|
+ ". This most likely means that the format string is shorter then the value fields.");
|
||||||
return new VCFGenotypeRecord(sampleName, tagToValue, bases, phase, referenceBase);
|
return new VCFGenotypeRecord(sampleName, tagToValue, bases, phase, referenceBase);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,9 @@ package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.text.DateFormat;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.Date;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
|
@ -24,7 +27,7 @@ public class VCFValidator {
|
||||||
* and if no errors pop up in processing, well hey, looks good to us.
|
* and if no errors pop up in processing, well hey, looks good to us.
|
||||||
*
|
*
|
||||||
* @param args the vcf file is the only required parameter, with the optional -A indicating that errors
|
* @param args the vcf file is the only required parameter, with the optional -A indicating that errors
|
||||||
* should be held until the end of processing
|
* should be held until the end of processing
|
||||||
*/
|
*/
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
boolean catchAll = false;
|
boolean catchAll = false;
|
||||||
|
|
@ -37,13 +40,14 @@ public class VCFValidator {
|
||||||
printUsage();
|
printUsage();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
printHeader(args[(catchAll) ? 1 : 0]);
|
||||||
File vcfFile = new File(args[(catchAll) ? 1 : 0]);
|
File vcfFile = new File(args[(catchAll) ? 1 : 0]);
|
||||||
if (!vcfFile.exists()) {
|
if (!vcfFile.exists()) {
|
||||||
System.err.println("Specified VCF file doesn't exist, please check the input file\n");
|
System.err.println("Specified VCF file doesn't exist, please check the input file\n");
|
||||||
printUsage();
|
printUsage();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// count how many records we see
|
// count how many records we've seen
|
||||||
int recordCount = 0;
|
int recordCount = 0;
|
||||||
Map<Integer, Exception> problems = new TreeMap<Integer, Exception>();
|
Map<Integer, Exception> problems = new TreeMap<Integer, Exception>();
|
||||||
|
|
||||||
|
|
@ -53,24 +57,28 @@ public class VCFValidator {
|
||||||
|
|
||||||
// the number of samples should be set in the header and consistent over all records
|
// the number of samples should be set in the header and consistent over all records
|
||||||
final int sampleCount = reader.getHeader().getGenotypeSamples().size();
|
final int sampleCount = reader.getHeader().getGenotypeSamples().size();
|
||||||
while (reader.hasNext()) {
|
boolean keepGoing = true;
|
||||||
|
while (keepGoing) {
|
||||||
try {
|
try {
|
||||||
recordCount++;
|
recordCount++;
|
||||||
VCFRecord rec = reader.next();
|
keepGoing = reader.hasNext();
|
||||||
// if the header indicates we have genotyping data, try to extract it for all samples
|
if (keepGoing) {
|
||||||
if (reader.getHeader().hasGenotypingData()) {
|
VCFRecord rec = reader.next();
|
||||||
int sampleCounter = 0;
|
// if the header indicates we have genotyping data, try to extract it for all samples
|
||||||
for (VCFGenotypeRecord genorec : rec.getVCFGenotypeRecords()) {
|
if (reader.getHeader().hasGenotypingData()) {
|
||||||
sampleCounter++;
|
int sampleCounter = 0;
|
||||||
/**
|
for (VCFGenotypeRecord genorec : rec.getVCFGenotypeRecords()) {
|
||||||
* just cycle through the records right now; any additional checks for
|
sampleCounter++;
|
||||||
* the records should go in this block.
|
/**
|
||||||
**/
|
* just cycle through the records right now; any additional checks for
|
||||||
|
* the records should go in this block.
|
||||||
|
**/
|
||||||
|
}
|
||||||
|
if (sampleCounter != sampleCount)
|
||||||
|
throw new RuntimeException("Record " + recordCount + " does not have the required number " +
|
||||||
|
"of records (" + sampleCounter + " in the record, " + sampleCount + " in the header)");
|
||||||
|
|
||||||
}
|
}
|
||||||
if (sampleCounter != sampleCount)
|
|
||||||
throw new RuntimeException("Record " + recordCount + " does not have the required number " +
|
|
||||||
"of records (" + sampleCounter + " in the record, " + sampleCount + " in the header)");
|
|
||||||
|
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
if (catchAll)
|
if (catchAll)
|
||||||
|
|
@ -82,9 +90,10 @@ public class VCFValidator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
if (catchAll)
|
if (catchAll) {
|
||||||
problems.put(new Integer(0), e);
|
problems.put(new Integer(0), e);
|
||||||
else
|
e.printStackTrace();
|
||||||
|
} else
|
||||||
validationFailed(e, recordCount);
|
validationFailed(e, recordCount);
|
||||||
}
|
}
|
||||||
System.err.println("Viewed " + recordCount + " VCF record entries.");
|
System.err.println("Viewed " + recordCount + " VCF record entries.");
|
||||||
|
|
@ -108,9 +117,7 @@ public class VCFValidator {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** print the usage information for the VCF validator */
|
||||||
* print the usage information for the VCF validator
|
|
||||||
*/
|
|
||||||
public static void printUsage() {
|
public static void printUsage() {
|
||||||
System.err.println("VCF validator (VCF Version " + VCF_VERSION + ")");
|
System.err.println("VCF validator (VCF Version " + VCF_VERSION + ")");
|
||||||
System.err.println("Usage:");
|
System.err.println("Usage:");
|
||||||
|
|
@ -121,4 +128,16 @@ public class VCFValidator {
|
||||||
System.err.println("");
|
System.err.println("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void printHeader(String file) {
|
||||||
|
System.err.println("-------------------------------------------");
|
||||||
|
System.err.println("VCF Validator v1.0\n");
|
||||||
|
System.err.println("Run on file " + file + " at " + getDateTime());
|
||||||
|
System.err.println("-------------------------------------------");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getDateTime() {
|
||||||
|
DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
|
||||||
|
Date date = new Date();
|
||||||
|
return dateFormat.format(date);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,16 +26,13 @@ public class VCFHeaderTest extends BaseTest {
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testHeaderConstructor() {
|
public void testHeaderConstructor() {
|
||||||
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
|
||||||
headerFields.add(field);
|
|
||||||
}
|
|
||||||
metaData.put("format","VCRv3.2");
|
metaData.put("format","VCRv3.2");
|
||||||
metaData.put("two","2");
|
metaData.put("two","2");
|
||||||
additionalColumns.add("extra1");
|
additionalColumns.add("extra1");
|
||||||
additionalColumns.add("extra2");
|
additionalColumns.add("extra2");
|
||||||
// this should create a header that is valid
|
// this should create a header that is valid
|
||||||
|
|
||||||
VCFHeader header = new VCFHeader(headerFields, metaData, additionalColumns);
|
VCFHeader header = new VCFHeader(metaData, additionalColumns);
|
||||||
|
|
||||||
// check the fields
|
// check the fields
|
||||||
int index = 0;
|
int index = 0;
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,11 @@ public class VCFReaderTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testVCFInput() {
|
public void testVCFInput() {
|
||||||
|
try {
|
||||||
|
Thread.sleep(5000);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
|
}
|
||||||
VCFReader reader = new VCFReader(vcfFile);
|
VCFReader reader = new VCFReader(vcfFile);
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
while (reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
|
|
|
||||||
|
|
@ -46,15 +46,12 @@ public class VCFWriterTest extends BaseTest {
|
||||||
* @return a fake VCF header
|
* @return a fake VCF header
|
||||||
*/
|
*/
|
||||||
private VCFHeader createFakeHeader() {
|
private VCFHeader createFakeHeader() {
|
||||||
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
|
||||||
headerFields.add(field);
|
|
||||||
}
|
|
||||||
metaData.put("format", "VCRv3.2"); // required
|
metaData.put("format", "VCRv3.2"); // required
|
||||||
metaData.put("two", "2");
|
metaData.put("two", "2");
|
||||||
additionalColumns.add("FORMAT");
|
additionalColumns.add("FORMAT");
|
||||||
additionalColumns.add("extra1");
|
additionalColumns.add("extra1");
|
||||||
additionalColumns.add("extra2");
|
additionalColumns.add("extra2");
|
||||||
return new VCFHeader(headerFields, metaData, additionalColumns);
|
return new VCFHeader(metaData, additionalColumns);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue