From a6d3e4bd4709c262268b7bb831e82b2247a32024 Mon Sep 17 00:00:00 2001 From: aaron Date: Tue, 22 Jun 2010 18:26:37 +0000 Subject: [PATCH] Add code to allow reference alleles with 'N' in VariantContext, but not in the alternate allele(s). Also more updates to the VCF 4 code (fixed parsing for files without genotypes). This check-in will temperarly break the build (I need to see if Bamboo is correctly returning the log file for the failed builds). Will be fixed once Bamboo starts building. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3609 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/contexts/variantcontext/Allele.java | 15 +- .../gatk/refdata/VariantContextAdaptors.java | 22 +-- .../gatk/refdata/features/vcf4/VCF4Codec.java | 160 +++++++++++------- .../walkers/VCF4ReaderTestWalker.java | 2 +- .../refdata/features/vcf4/VCF4UnitTest.java | 47 +++-- 5 files changed, 162 insertions(+), 84 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java index 8bcb056bb..3e97b8de5 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java @@ -106,7 +106,7 @@ public class Allele implements Comparable { this.isRef = isRef; this.bases = bases; - if ( ! acceptableAlleleBases(bases) ) + if ( ! acceptableAlleleBases(bases,isRef) ) throw new IllegalArgumentException("Unexpected base in allele bases " + new String(bases)); } @@ -186,24 +186,29 @@ public class Allele implements Comparable { /** * @param bases bases representing an allele + * @param reference is this the reference allele * @return true if the bases represent the well formatted allele */ - public static boolean acceptableAlleleBases(String bases) { - return acceptableAlleleBases(bases.getBytes()); + public static boolean acceptableAlleleBases(String bases, boolean reference) { + return acceptableAlleleBases(bases.getBytes(),reference); } /** * @param bases bases representing an allele + * @param reference are we the reference (we allow n's in the reference allele) * @return true if the bases represent the well formatted allele */ - public static boolean acceptableAlleleBases(byte[] bases) { + public static boolean acceptableAlleleBases(byte[] bases, boolean reference) { if ( wouldBeNullAllele(bases) || wouldBeNoCallAllele(bases) ) return true; for ( int i = 0; i < bases.length; i++ ) { switch (bases[i]) { case 'A': case 'C': case 'G': case 'T': break; - default: return false; + case 'N': if(!reference) + return false; break; + default: + return false; } // if ( ! BaseUtils.isRegularBase(bases[i]) ) { // return false; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index f59c6e197..2b60ab92b 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -94,7 +94,7 @@ public class VariantContextAdaptors { private static class DBSnpAdaptor extends VCAdaptor { VariantContext convert(String name, Object input, ReferenceContext ref) { DbSNPFeature dbsnp = (DbSNPFeature)input; - if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp)) ) + if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp),true) ) return null; Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true); @@ -105,7 +105,7 @@ public class VariantContextAdaptors { // add all of the alt alleles for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) { - if ( ! Allele.acceptableAlleleBases(alt) ) { + if ( ! Allele.acceptableAlleleBases(alt,false) ) { //System.out.printf("Excluding dbsnp record %s%n", dbsnp); return null; } @@ -131,7 +131,7 @@ public class VariantContextAdaptors { private static VariantContext vcfToVariantContext(String name, VCFRecord vcf, ReferenceContext ref) { if ( vcf.isReference() || vcf.isSNP() || vcf.isIndel() ) { // add the reference allele - if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) { + if ( ! Allele.acceptableAlleleBases(vcf.getReference(),true) ) { System.out.printf("Excluding vcf record %s%n", vcf); return null; } @@ -146,7 +146,7 @@ public class VariantContextAdaptors { alleles.add(refAllele); for ( VCFGenotypeEncoding alt : vcf.getAlternateAlleles() ) { - if ( ! Allele.acceptableAlleleBases(alt.getBases()) ) { + if ( ! Allele.acceptableAlleleBases(alt.getBases(),false) ) { //System.out.printf("Excluding vcf record %s%n", vcf); return null; } @@ -531,7 +531,7 @@ public class VariantContextAdaptors { VariantContext convert(String name, Object input, ReferenceContext ref) { RodGLF glf = (RodGLF)input; - if ( ! Allele.acceptableAlleleBases(glf.getReference()) ) + if ( ! Allele.acceptableAlleleBases(glf.getReference(),true) ) return null; Allele refAllele = Allele.create(glf.getReference(), true); @@ -543,7 +543,7 @@ public class VariantContextAdaptors { // add all of the alt alleles for ( String alt : glf.getAlternateAlleleList() ) { - if ( ! Allele.acceptableAlleleBases(alt) ) { + if ( ! Allele.acceptableAlleleBases(alt,false) ) { return null; } Allele allele = Allele.create(alt, false); @@ -603,7 +603,7 @@ public class VariantContextAdaptors { */ VariantContext convert(String name, Object input, ReferenceContext ref) { GeliTextFeature geli = (GeliTextFeature)input; - if ( ! Allele.acceptableAlleleBases(String.valueOf(geli.getRefBase())) ) + if ( ! Allele.acceptableAlleleBases(String.valueOf(geli.getRefBase()),true) ) return null; Allele refAllele = Allele.create(String.valueOf(geli.getRefBase()), true); @@ -614,7 +614,7 @@ public class VariantContextAdaptors { List genotypeAlleles = new ArrayList(); // add all of the alt alleles for ( char alt : geli.getGenotype().toString().toCharArray() ) { - if ( ! Allele.acceptableAlleleBases(String.valueOf(alt)) ) { + if ( ! Allele.acceptableAlleleBases(String.valueOf(alt),false) ) { return null; } Allele allele = Allele.create(String.valueOf(alt), false); @@ -670,7 +670,7 @@ public class VariantContextAdaptors { */ VariantContext convert(String name, Object input, ReferenceContext ref) { GenotypeLikelihoods geli = ((rodGELI)input).getGeliRecord(); - if ( ! Allele.acceptableAlleleBases(String.valueOf((char)geli.getReferenceBase())) ) + if ( ! Allele.acceptableAlleleBases(String.valueOf((char)geli.getReferenceBase()),true) ) return null; Allele refAllele = Allele.create(String.valueOf((char)geli.getReferenceBase()), true); @@ -679,14 +679,14 @@ public class VariantContextAdaptors { alleles.add(refAllele); // add the two alt alleles - if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele1()))) { + if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele1()),false)) { return null; } Allele allele1 = Allele.create(String.valueOf((char) geli.getBestGenotype().getAllele1()), false); if (!alleles.contains(allele1) && !allele1.equals(refAllele, true)) alleles.add(allele1); // add the two alt alleles - if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele2()))) { + if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele2()),false)) { return null; } Allele allele2 = Allele.create(String.valueOf((char) geli.getBestGenotype().getAllele2()), false); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index dcddda97a..5a1c6350c 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -28,31 +28,29 @@ public class VCF4Codec implements FeatureCodec { // we have to store the list of strings that make up the header until they're needed private VCFHeader header = null; - // used to subtract from the + // used to convert the index of the alternate allele in genotypes to a integer index private static int ZERO_CHAR = (byte)'0'; // a mapping of the allele private static Map> alleleMap = new HashMap>(3); - //private static String[] CachedGTKey = new String[100]; + // cache the genotyope values private static String[] CachedGTValues = new String[100]; - // for performance testing purposes - public static boolean parseGenotypesToo = true; - // for performance testing purposes public static boolean validate = true; // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over + // todo: make this thread safe? private String[] parts = null; // for performance we cache the hashmap of filter encodings for quick lookup private HashMap> filterHash = new HashMap>(); - // a set of the genotype keys? TODO: rename to something better - private String[] GTKeys = new String[100]; + // a set of the genotype keys? + private String[] genotypeKeyArray = new String[100]; - // a list of the info fields, filter fields, and format fields + // a list of the info fields, filter fields, and format fields, for quick lookup to validate against ArrayList infoFields = new ArrayList(); ArrayList formatFields = new ArrayList(); ArrayList filterFields = new ArrayList(); @@ -60,6 +58,9 @@ public class VCF4Codec implements FeatureCodec { // do we want to validate the info, format, and filter fields private final boolean validateFromHeader = true; + // we store a name to give to each of the variant contexts we emit + private String name = "Unkown"; + /** * this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates * @param reader the line reader to take header lines from @@ -76,27 +77,7 @@ public class VCF4Codec implements FeatureCodec { headerStrings.add(line); } else if (line.startsWith("#")) { - headerStrings.add(line); - header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0); - - // load the parsing fields - Set headerLines = header.getMetaData(); - - // setup our look-up lists for validation - for (VCFHeaderLine hl : headerLines) { - if (hl.getClass() == VCFFilterHeaderLine.class) - this.filterFields.add(((VCFFilterHeaderLine)hl).getmName()); - if (hl.getClass() == VCFFormatHeaderLine.class) - this.formatFields.add(((VCFFormatHeaderLine)hl).getmName()); - if (hl.getClass() == VCFInfoHeaderLine.class) - this.infoFields.add(((VCFInfoHeaderLine)hl).getmName()); - } - // sort the lists - Collections.sort(filterFields); - Collections.sort(formatFields); - Collections.sort(infoFields); - - return headerStrings.size(); + return createHeader(headerStrings, line); } else { throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); @@ -110,6 +91,41 @@ public class VCF4Codec implements FeatureCodec { } + /** + * create a VCF header + * @param headerStrings a list of strings that represent all the ## entries + * @param line the single # line (column names) + * @return the count of header lines + */ + private int createHeader(List headerStrings, String line) { + headerStrings.add(line); + header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0); + + // load the parsing fields + Set headerLines = header.getMetaData(); + + // setup our look-up lists for validation + for (VCFHeaderLine hl : headerLines) { + if (hl.getClass() == VCFFilterHeaderLine.class) + this.filterFields.add(((VCFFilterHeaderLine)hl).getmName()); + if (hl.getClass() == VCFFormatHeaderLine.class) + this.formatFields.add(((VCFFormatHeaderLine)hl).getmName()); + if (hl.getClass() == VCFInfoHeaderLine.class) + this.infoFields.add(((VCFInfoHeaderLine)hl).getmName()); + } + // sort the lists so we can binary search them later on + Collections.sort(filterFields); + Collections.sort(formatFields); + Collections.sort(infoFields); + + return headerStrings.size(); + } + + /** + * the fast decode function + * @param line the line of text for the record + * @return a feature, (not guaranteed complete) that has the correct start and stop + */ public Feature decodeLoc(String line) { return decode(line); } @@ -205,6 +221,11 @@ public class VCF4Codec implements FeatureCodec { return attributes; } + /** + * validate the attributes against the stored fields of the appopriate type + * @param attributes the list of fields to check for inclusion against the field array + * @param fields the master list; all attributes must be in this list to validate + */ private void validateFields(Set attributes, List fields) { // validate the info fields if (validateFromHeader) { @@ -234,15 +255,15 @@ public class VCF4Codec implements FeatureCodec { private List parseAlleles(String ref, String alts) { List alleles = new ArrayList(2); // we are almost always biallelic // ref - checkAllele(ref); + checkAllele(ref, true); Allele refAllele = Allele.create(ref, true); alleles.add(refAllele); if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split - parseSingleAllele(alleles, alts); + parseSingleAllele(alleles, alts, false); else for ( String alt : Utils.split(alts, ",") ) - parseSingleAllele(alleles, alt); + parseSingleAllele(alleles, alt, false); return alleles; } @@ -250,9 +271,10 @@ public class VCF4Codec implements FeatureCodec { /** * check to make sure the allele is an acceptable allele * @param allele the allele to check + * @param isRef are we the reference allele? */ - private static void checkAllele(String allele) { - if ( ! Allele.acceptableAlleleBases(allele) ) { + private static void checkAllele(String allele,boolean isRef) { + if ( ! Allele.acceptableAlleleBases(allele,isRef) ) { throw new StingException("Unparsable vcf record with allele " + allele); } } @@ -261,9 +283,10 @@ public class VCF4Codec implements FeatureCodec { * parse a single allele, given the allele list * @param alleles the alleles available * @param alt the allele to parse + * @param isRef are we the reference allele? */ - private void parseSingleAllele(List alleles, String alt) { - checkAllele(alt); + private void parseSingleAllele(List alleles, String alt, boolean isRef) { + checkAllele(alt,isRef); Allele allele = Allele.create(alt, false); if ( ! allele.isNoCall() ) @@ -304,10 +327,12 @@ public class VCF4Codec implements FeatureCodec { /** * parse out the VCF line + * * @param parts the parts split up * @return a variant context object */ private VariantContext parseVCFLine(String[] parts) { + // parse out the required fields String contig = parts[0]; long pos = Long.valueOf(parts[1]); String id = parts[2]; @@ -316,26 +341,31 @@ public class VCF4Codec implements FeatureCodec { Double qual = parseQual(parts[5]); String filter = parts[6]; String info = parts[7]; - String GT = parts[8]; - int genotypesStart = 9; - List alleles = parseAlleles(ref, alts); + + List alleles = parseAlleles(ref, alts); Set filters = parseFilters(filter); Map attributes = parseInfo(info, id); - // parse genotypes - int nGTKeys = ParsingUtils.split(GT, GTKeys, ':'); - Map genotypes = new HashMap(Math.max(parts.length - genotypesStart, 1)); - Iterator iter = header.getGenotypeSamples().iterator(); + // find out our current location, and clip the alleles down to their minimum length + Pair> locAndAlleles = (ref.length() > 1) ? + clipAlleles(contig, pos, ref, alleles) : + new Pair>(GenomeLocParser.createGenomeLoc(contig, pos), alleles); - Pair> locAndAlleles = (ref.length() > 1) ? - clipAlleles(contig,pos,ref,alleles) : - new Pair>(GenomeLocParser.createGenomeLoc(contig,pos),alleles); + // a map to store our genotypes + Map genotypes = null; + // do we have genotyping data + if (parts.length > 8) { + String GT = parts[8]; + int genotypesStart = 9; + // parse genotypes + int nGTKeys = ParsingUtils.split(GT, genotypeKeyArray, ':'); + genotypes = new HashMap(Math.max(parts.length - genotypesStart, 1)); + Iterator iter = header.getGenotypeSamples().iterator(); - if ( parseGenotypesToo ) { alleleMap.clear(); - for ( int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++ ) { + for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) { String sample = parts[genotypeOffset]; String[] GTValues = CachedGTValues; ParsingUtils.split(sample, GTValues, ':'); @@ -345,30 +375,30 @@ public class VCF4Codec implements FeatureCodec { // todo -- the parsing of attributes could be made lazy for performance Map gtAttributes = null; - if ( nGTKeys > 1 ) { + if (nGTKeys > 1) { gtAttributes = new HashMap(nGTKeys - 1); - for ( int i = 1; i < nGTKeys; i++ ) { - if ( GTKeys[i].equals("GQ") ) { + for (int i = 1; i < nGTKeys; i++) { + if (genotypeKeyArray[i].equals("GQ")) { GTQual = parseQual(GTValues[i]); - } if ( GTKeys[i].equals("FL") ) { // deal with genotype filters here + } + if (genotypeKeyArray[i].equals("FL")) { // deal with genotype filters here genotypeFilters.addAll(parseFilters(GTValues[i])); } else { - gtAttributes.put(GTKeys[i], GTValues[i]); + gtAttributes.put(genotypeKeyArray[i], GTValues[i]); } } // validate the format fields - validateFields(gtAttributes.keySet(),formatFields); + validateFields(gtAttributes.keySet(), formatFields); } - boolean phased = GTKeys[0].charAt(1) == '|'; + boolean phased = genotypeKeyArray[0].charAt(1) == '|'; Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased); genotypes.put(g.getSampleName(), g); } } - // todo -- we need access to our track name to name the variant context - return new VariantContext("foo", locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); + return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); } /** @@ -434,4 +464,20 @@ public class VCF4Codec implements FeatureCodec { if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class); return this.header; } + + /** + * get the name of this codec + * @return our set name + */ + public String getName() { + return name; + } + + /** + * set the name of this codec + * @param name + */ + public void setName(String name) { + this.name = name; + } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java index 691e7bd4b..985ca3219 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4ReaderTestWalker.java @@ -92,7 +92,7 @@ public class VCF4ReaderTestWalker extends RodWalker { String[] parts = new String[10000]; public void onTraversalDone(Long num){ VCF4Codec vcf4codec = new VCF4Codec(); - VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES; + //VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES; VCF4Codec.validate = ! DontValidate; VCFCodec vcf3codec = new VCFCodec(); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java index 556b70f2c..deac944a9 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java @@ -26,7 +26,8 @@ import java.util.Set; * test out pieces of the VCF 4 codec. */ public class VCF4UnitTest extends BaseTest { - File vcfFile = new File("testdata/vcfExmaple.vcf4"); + File vcfGenotypeFile = new File("testdata/vcf/vcfWithGenotypes.vcf"); + File vcfNoGenotypeFile = new File("testdata/vcf/vcfWithoutGenotypes.vcf"); // setup the contig ordering @BeforeClass @@ -42,7 +43,7 @@ public class VCF4UnitTest extends BaseTest { @Test public void testReadBasicHeader() { - TestSetup testSetup = new TestSetup().invoke(vcfFile); + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); int seenRecords = 0; @@ -95,7 +96,7 @@ public class VCF4UnitTest extends BaseTest { @Test public void testOutputHeader() { - TestSetup testSetup = new TestSetup().invoke(vcfFile); + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); File tempFile = null; try { @@ -115,9 +116,8 @@ public class VCF4UnitTest extends BaseTest { @Test public void testCountVCF4Records() { - TestSetup testSetup = new TestSetup().invoke(vcfFile); + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); AsciiLineReader reader = testSetup.getReader(); - VCF4Codec codec = testSetup.getCodec(); // now parse the lines String line = null; @@ -139,6 +139,33 @@ public class VCF4UnitTest extends BaseTest { Assert.fail("Failed to read a line"); } } + Assert.assertEquals(7,recordCount); + } + + @Test + public void testCountVCF4RecordsWithoutGenotypes() { + TestSetup testSetup = new TestSetup().invoke(vcfNoGenotypeFile); + AsciiLineReader reader = testSetup.getReader(); + + // now parse the lines + String line = null; + try { + line = reader.readLine(); + } catch (IOException e) { + Assert.fail("Failed to read a line"); + } + + // our record count + int recordCount = 0; + while (line != null) { + try { + recordCount++; + testSetup.codec.decode(line); + line = reader.readLine(); + } catch (IOException e) { + Assert.fail("Failed to read a line"); + } + } Assert.assertEquals(6,recordCount); } @@ -150,25 +177,25 @@ public class VCF4UnitTest extends BaseTest { @Test public void testCheckInfoValidation() { - TestSetup testSetup = new TestSetup().invoke(vcfFile); + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); testSetup.codec.decode(regularLine); } @Test public void testCheckTwoFewInfoValidation() { - TestSetup testSetup = new TestSetup().invoke(vcfFile); + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); testSetup.codec.decode(twoFewInfoLine); } @Test(expected=StingException.class) public void testCheckTwoManyInfoValidation() { - TestSetup testSetup = new TestSetup().invoke(vcfFile); + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); testSetup.codec.decode(twoManyInfoLine); } - //File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf"); + /* File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf"); - /*@Test + @Test public void checkLargeVCF() { TestSetup testSetup = new TestSetup().invoke(largeVCF); AsciiLineReader reader = testSetup.getReader();