My interpretation of the VCF spec is that the FORMAT field should only be present if there is genotype/sample data. The VCFCodec therefore now throws an exception when a header declares the FORMAT column but no sample columns follow it. I had to fix one of the integration test VCFs accordingly.

This commit is contained in:
Eric Banks 2011-09-21 22:23:28 -04:00
parent e53cb79d42
commit 8f8b59a932
3 changed files with 32 additions and 29 deletions

View File

@ -115,15 +115,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
} }
arrayIndex++; arrayIndex++;
} }
boolean sawFormatTag = false;
if ( arrayIndex < strings.length ) { if ( arrayIndex < strings.length ) {
if ( !strings[arrayIndex].equals("FORMAT") ) if ( !strings[arrayIndex].equals("FORMAT") )
throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'");
sawFormatTag = true;
arrayIndex++; arrayIndex++;
} }
while (arrayIndex < strings.length) while ( arrayIndex < strings.length )
auxTags.add(strings[arrayIndex++]); auxTags.add(strings[arrayIndex++]);
if ( sawFormatTag && auxTags.size() == 0 )
throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
} else { } else {
if ( str.startsWith("##INFO=") ) { if ( str.startsWith("##INFO=") ) {
VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version); VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
@ -200,10 +206,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
* @return a VariantContext * @return a VariantContext
*/ */
public Feature decode(String line) { public Feature decode(String line) {
return reallyDecode(line);
}
private Feature reallyDecode(String line) {
// the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;

View File

@ -35,9 +35,6 @@ public class VCFHeader {
// the header string indicator // the header string indicator
public static final String HEADER_INDICATOR = "#"; public static final String HEADER_INDICATOR = "#";
/** do we have genotyping data? */
private boolean hasGenotypingData = false;
// were the input samples sorted originally (or are we sorting them)? // were the input samples sorted originally (or are we sorting them)?
private boolean samplesWereAlreadySorted = true; private boolean samplesWereAlreadySorted = true;
@ -57,17 +54,15 @@ public class VCFHeader {
* create a VCF header, given a list of meta data and auxiliary tags * create a VCF header, given a list of meta data and auxiliary tags
* *
* @param metaData the meta data associated with this header * @param metaData the meta data associated with this header
* @param genotypeSampleNames the genotype format field, and the sample names * @param genotypeSampleNames the sample names
*/ */
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) { public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
mMetaData = new TreeSet<VCFHeaderLine>(); mMetaData = new TreeSet<VCFHeaderLine>();
if ( metaData != null ) if ( metaData != null )
mMetaData.addAll(metaData); mMetaData.addAll(metaData);
for (String col : genotypeSampleNames) {
if (!col.equals("FORMAT")) mGenotypeSampleNames.addAll(genotypeSampleNames);
mGenotypeSampleNames.add(col);
}
if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
loadVCFVersion(); loadVCFVersion();
loadMetaDataMaps(); loadMetaDataMaps();
@ -157,7 +152,7 @@ public class VCFHeader {
* @return true if we have genotyping columns, false otherwise * @return true if we have genotyping columns, false otherwise
*/ */
public boolean hasGenotypingData() { public boolean hasGenotypingData() {
return hasGenotypingData; return mGenotypeSampleNames.size() > 0;
} }
/** /**
@ -171,7 +166,7 @@ public class VCFHeader {
/** @return the column count */ /** @return the column count */
public int getColumnCount() { public int getColumnCount() {
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0); return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
} }
/** /**

View File

@ -174,6 +174,12 @@ public class UserException extends ReviewedStingException {
} }
} }
/**
 * User-facing exception signalling that the header of a provided VCF file is malformed.
 */
public static class MalformedVCFHeader extends UserException {
/**
 * Builds the exception text by appending the given detail to the fixed prefix
 * "The provided VCF file has a malformed header: ".
 *
 * @param message description of what is wrong with the header
 */
public MalformedVCFHeader(String message) {
super(String.format("The provided VCF file has a malformed header: %s", message));
}
}
public static class ReadMissingReadGroup extends MalformedBAM { public static class ReadMissingReadGroup extends MalformedBAM {
public ReadMissingReadGroup(SAMRecord read) { public ReadMissingReadGroup(SAMRecord read) {
super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName()));