diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 0b8ae0fd7..44f7b8e8d 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -90,7 +90,10 @@ public class VCFGenotypeRecord { String str = ""; boolean first = true; for (VCFGenotypeEncoding allele : mGenotypeAlleles) { - str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0); + if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED) + str += VCFGenotypeRecord.EMPTY_GENOTYPE; + else + str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0); if (first) { switch (mPhaseType) { case UNPHASED: diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index 5227c2ca3..6cebf4489 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.utils.genotype.*; +import org.apache.log4j.Logger; import java.io.File; import java.io.OutputStream; @@ -25,6 +26,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { private final File mFile; private final OutputStream mStream; + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class); + + public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo, Set sampleNames) { mReferenceName = referenceName; mSource = source; @@ -125,12 +130,32 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { VCFParameters params = new VCFParameters(); params.addFormatItem("GT"); - for (Genotype gtype : genotypes) { - // setup the parameters - params.setLocations(gtype.getLocation(), gtype.getReference()); + // mapping of our sample names to genotypes + if (genotypes.size() < 1) { + throw new IllegalArgumentException("Unable to parse out the current location: genotype array must at least contain one entry"); + } - VCFGenotypeRecord record = createVCFGenotypeRecord(params, gtype); - params.addGenotypeRecord(record); + // get the location and reference + params.setLocations(genotypes.get(0).getLocation(), genotypes.get(0).getReference()); + + Map genotypeMap = genotypeListToSampleNameMap(genotypes); + + for (String name : mHeader.getGenotypeSamples()) { + if (genotypeMap.containsKey(name)) { + Genotype gtype = genotypeMap.get(name); + VCFGenotypeRecord record = createVCFGenotypeRecord(params, gtype); + params.addGenotypeRecord(record); + genotypeMap.remove(name); + } else { + VCFGenotypeRecord record = createNoCallRecord(params, name); + params.addGenotypeRecord(record); + } + } + + if (genotypeMap.size() > 0) { + for (String name : genotypeMap.keySet()) + logger.fatal("Genotype " + name + " was present in the VCFHeader"); + throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header"); } Map infoFields = getInfoFields(metadata, params); @@ -213,6 +238,29 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { return record; } + /** + * create a no call record + * + * @param params the VCF parameters object + * @param sampleName the sample name + * + * @return a VCFGenotypeRecord for the no call situation + */ + private VCFGenotypeRecord createNoCallRecord(VCFParameters params, String sampleName) { + Map map = new HashMap(); + + + List alleles = new ArrayList(); + alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE)); + alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE)); + + VCFGenotypeRecord record = new VCFGenotypeRecord(sampleName, + alleles, + VCFGenotypeRecord.PHASE.UNPHASED, + map); + return record; + } + /** * create the allele array? * @@ -234,4 +282,21 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { return true; } + /** + * create a genotype mapping from a list and their sample names + * + * @param list a list of genotype samples + * + * @return a mapping of the sample name to genotype fields + */ + private static Map genotypeListToSampleNameMap(List list) { + Map map = new HashMap(); + for (Genotype rec : list) { + if (!(rec instanceof SampleBacked)) + throw new IllegalArgumentException("Genotype must be backed by sample information"); + map.put(((SampleBacked) rec).getSampleName(), rec); + } + return map; + } + } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java index 3550b9b61..cb853a280 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java @@ -25,7 +25,7 @@ public class VCFReader implements Iterator, Iterable { private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$"); // our pattern matching for the genotype mFields - private static final Pattern gtPattern = Pattern.compile("([0-9]+)([\\\\|\\/])([0-9]*)"); + private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)"); /** * Create a VCF reader, given a VCF file @@ -48,7 +48,7 @@ public class VCFReader implements Iterator, Iterable { lines.add(line); line = mReader.readLine(); } - mHeader = this.createHeader(lines); + mHeader = this.createHeader(lines); mNextRecord = createRecord(line, mHeader); } catch (IOException e) { throw new RuntimeException("VCFReader: Failed to parse VCF File on line: " + line, e); @@ -236,8 +236,7 @@ public class VCFReader implements Iterator, Iterable { phase = VCFGenotypeRecord.determinePhase(m.group(2)); addAllele(m.group(1), altAlleles, referenceBase, bases); if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases); - } - else { + } else { tagToValue.put(key, parse); } if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1; @@ -263,14 +262,18 @@ public class VCFReader implements Iterator, Iterable { * @param bases the list of bases for this genotype call */ private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List bases) { - int alleleValue = Integer.valueOf(alleleNumber); - // check to make sure the allele value is within bounds - if (alleleValue < 0 || alleleValue > altAlleles.length) - throw new IllegalArgumentException("VCFReader: the allele value of " + alleleValue + " is out of bounds given the alternate allele list."); - if (alleleValue == 0) - bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase))); - else - bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1])); + if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_GENOTYPE)) { + bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE)); + } else { + int alleleValue = Integer.valueOf(alleleNumber); + // check to make sure the allele value is within bounds + if (alleleValue < 0 || alleleValue > altAlleles.length) + throw new IllegalArgumentException("VCFReader: the allele value of " + alleleValue + " is out of bounds given the alternate allele list."); + if (alleleValue == 0) + bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase))); + else + bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1])); + } } diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java index 2c9144cc2..1e8e810e6 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java @@ -1,91 +1,136 @@ package org.broadinstitute.sting.utils.genotype.vcf; -import org.junit.Test; -import org.junit.Assert; import org.broadinstitute.sting.BaseTest; +import org.junit.Assert; +import org.junit.Test; import java.io.File; +import java.util.List; import java.util.Map; -/** - * test the VCFReader class test - */ +/** test the VCFReader class test */ public class VCFReaderTest extends BaseTest { private static File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf"); + private static File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf"); @Test public void testVCFInput() { - try { - Thread.sleep(5000); - } catch (InterruptedException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - } VCFReader reader = new VCFReader(vcfFile); int counter = 0; while (reader.hasNext()) { counter++; reader.next(); } - Assert.assertEquals(5,counter); + Assert.assertEquals(5, counter); } - /** - * test the basic parsing - */ + @Test + public void testMultiSampleVCFInput() { + VCFReader reader = new VCFReader(multiSampleVCF); + int counter = 0; + while (reader.hasNext()) { + counter++; + reader.next(); + } + Assert.assertEquals(99, counter); + } + + @Test + public void testNoCallSites() { + VCFReader reader = new VCFReader(multiSampleVCF); + if (!reader.hasNext()) Assert.fail("The reader should have a record"); + VCFRecord rec = reader.next(); + final int numberOfNoCallsTruth = 9; + int noCalls = 0; + for (VCFGenotypeRecord record : rec.getVCFGenotypeRecords()) { + List encodings = record.getAlleles(); + if (encodings.get(0).getType() == VCFGenotypeEncoding.TYPE.UNCALLED && + encodings.get(1).getType() == VCFGenotypeEncoding.TYPE.UNCALLED) + noCalls++; + } + Assert.assertEquals(numberOfNoCallsTruth, noCalls); + } + + + @Test + public void testKnownCallSites() { + VCFReader reader = new VCFReader(multiSampleVCF); + if (!reader.hasNext()) Assert.fail("The reader should have a record"); + VCFRecord rec = reader.next(); + boolean seenNA11992 = false; + boolean seenNA12287 = false; + for (VCFGenotypeRecord record : rec.getVCFGenotypeRecords()) { + if (record.getSampleName().equals("NA11992")) { + List encodings = record.getAlleles(); + if (!encodings.get(0).getBases().equals("A") || + !encodings.get(1).getBases().equals("A")) { + Assert.fail("Sample NA11992 at site 1:10000005 should be AA"); + } + seenNA11992 = true; + } + if (record.getSampleName().equals("NA12287")) { + List encodings = record.getAlleles(); + if (!encodings.get(0).getBases().equals("A") || + !encodings.get(1).getBases().equals("T")) { + Assert.fail("Sample NA11992 at site 1:10000005 should be AA"); + } + seenNA12287 = true; + } + } + Assert.assertTrue(seenNA11992 && seenNA12287); + } + + /** test the basic parsing */ @Test public void testBasicParsing() { String formatString = "GT:B:C:D"; String genotypeString = "0|1:2:3:4"; - String altAlleles[] = {"A","G","T"}; + String altAlleles[] = {"A", "G", "T"}; char referenceBase = 'C'; - VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase); - Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType()); - Assert.assertEquals("C",rec.getAlleles().get(0).toString()); - Assert.assertEquals("A",rec.getAlleles().get(1).toString()); - Map values = rec.getFields(); - Assert.assertEquals(3,values.size()); + VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase); + Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType()); + Assert.assertEquals("C", rec.getAlleles().get(0).toString()); + Assert.assertEquals("A", rec.getAlleles().get(1).toString()); + Map values = rec.getFields(); + Assert.assertEquals(3, values.size()); Assert.assertTrue(values.get("B").equals("2")); Assert.assertTrue(values.get("C").equals("3")); Assert.assertTrue(values.get("D").equals("4")); } - /** - * test the parsing of a genotype field with missing parameters - */ + /** test the parsing of a genotype field with missing parameters */ @Test public void testMissingFieldParsing() { String formatString = "GT:B:C:D"; String genotypeString = "0|1:::4"; - String altAlleles[] = {"A","G","T"}; + String altAlleles[] = {"A", "G", "T"}; char referenceBase = 'C'; - VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase); - Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType()); - Assert.assertEquals("C",rec.getAlleles().get(0).toString()); - Assert.assertEquals("A",rec.getAlleles().get(1).toString()); - Map values = rec.getFields(); - Assert.assertEquals(3,values.size()); + VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase); + Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType()); + Assert.assertEquals("C", rec.getAlleles().get(0).toString()); + Assert.assertEquals("A", rec.getAlleles().get(1).toString()); + Map values = rec.getFields(); + Assert.assertEquals(3, values.size()); Assert.assertTrue(values.get("B").equals("")); Assert.assertTrue(values.get("C").equals("")); Assert.assertTrue(values.get("D").equals("4")); } - /** - * test the parsing of a genotype field with different missing parameters - */ + /** test the parsing of a genotype field with different missing parameters */ @Test public void testMissingAllFields() { String formatString = "GT:B:C:D"; String genotypeString = "0|1:::"; - String altAlleles[] = {"A","G","T"}; + String altAlleles[] = {"A", "G", "T"}; char referenceBase = 'C'; - VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase); - Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType()); - Assert.assertEquals("C",rec.getAlleles().get(0).toString()); - Assert.assertEquals("A",rec.getAlleles().get(1).toString()); - Map values = rec.getFields(); - Assert.assertEquals(3,values.size()); + VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase); + Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType()); + Assert.assertEquals("C", rec.getAlleles().get(0).toString()); + Assert.assertEquals("A", rec.getAlleles().get(1).toString()); + Map values = rec.getFields(); + Assert.assertEquals(3, values.size()); Assert.assertTrue(values.get("B").equals("")); Assert.assertTrue(values.get("C").equals("")); Assert.assertTrue(values.get("D").equals(""));