From d3f4a5a9017b914894a73134abb8c7dbd5751ee6 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 10:37:38 -0500 Subject: [PATCH 1/4] Fail gracefully when encountering malformed VCFs without enough data columns --- .../org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java | 2 ++ .../src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 2 ++ 2 files changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index aaa2e63a7..b3329c708 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -120,6 +120,8 @@ public class VCF3Codec extends AbstractVCFCodec { genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + if ( nParts != genotypeParts.length ) + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); ArrayList genotypes = new ArrayList(nParts); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 4c1bb1d9e..453155be7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -147,6 +147,8 @@ public class VCFCodec extends AbstractVCFCodec { genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + if ( nParts != genotypeParts.length ) + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); ArrayList genotypes = new ArrayList(nParts); From 09a5a9eac08ec4b26983bff3c73a0373a91ca688 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 10:43:52 -0500 Subject: [PATCH 2/4] Don't update lineNo for decodeLoc - only for decode (otherwise they get double-counted). Even still, because of the way the GATK currently utilizes Tribble we can parse the same line multiple times, which knocks the line counter out of sync. For now, I've added a TODO in the code to remind us and the error messages note that it's an approximate line number. --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 3 ++- .../broadinstitute/sting/utils/exceptions/UserException.java | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 3009c236b..b902f220f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -184,7 +184,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - lineNo++; // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; @@ -279,6 +278,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { builder.source(getName()); // increment the line count + // TODO -- because of the way the engine utilizes Tribble, we can parse a line multiple times (especially when + // TODO -- the first record is far along the contig) and the line counter can get out of sync lineNo++; // parse out the required fields diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index c599d4759..a2816b58f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -184,11 +184,11 @@ public class UserException extends ReviewedStingException { public static class MalformedVCF extends UserException { public MalformedVCF(String message, String line) { - super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + super(String.format("The provided VCF file is malformed at approximately line %s: %s", line, message)); } public MalformedVCF(String message, int lineNo) { - super(String.format("The provided VCF file is malformed at line number %d: %s", lineNo, message)); + super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); } } From 9497e9492cb6904fac16349ae5cdaadb8a4350d8 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 11:21:28 -0500 Subject: [PATCH 3/4] Bug fix for complex records: do not ever reverse clip out a complete allele. --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index b902f220f..e44c10f1f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -595,6 +595,11 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { if ( a.isSymbolic() ) continue; + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - 1; + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) stillClipping = false; else if ( ref.length() == clipping ) From 76485217184bc6f6891f7a3a76380b38a3fceb6f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Dec 2011 11:26:43 -0500 Subject: [PATCH 4/4] Add check for mixed genotype so that we don't exception out for a valid record --- .../gatk/walkers/varianteval/evaluators/CountVariants.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index c740eb78c..e5e8dfaf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -182,6 +182,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { nHomDerived++; } + break; + case MIXED: break; default: throw new ReviewedStingException("BUG: Unexpected genotype type: " + g);