diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java index 09d8b7efb..ad0aa6ccd 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java @@ -59,7 +59,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, @Override public String toString() { if (this.mCurrentRecord != null) - return this.mCurrentRecord.toStringRepresentation(mReader.getHeader()); + return this.mCurrentRecord.toStringEncoding(mReader.getHeader()); else return ""; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFMixedUp.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFMixedUp.java deleted file mode 100755 index 813c6da41..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFMixedUp.java +++ /dev/null @@ -1,85 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.vcftools; - -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.RodVCF; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; -import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; - -import java.io.FileReader; -import java.io.IOException; -import java.io.BufferedReader; -import java.util.ArrayList; - -public class VCFMixedUp extends RefWalker { - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - for (ReferenceOrderedDatum rod : tracker.getAllRods()) { - if (rod instanceof RodVCF) { - for (String rodfile : this.getToolkit().getArguments().RODBindings) { - String[] rodpieces = rodfile.split(","); - - if (rodpieces[2].contains(".vcf")) { - try { - FileReader reader = new FileReader(rodpieces[2]); - BufferedReader breader = new BufferedReader(reader); - - ArrayList header = new ArrayList(); - ArrayList variant = new ArrayList(); - - String line; - while ((line = breader.readLine()) != null) { - String[] pieces = line.split("\\s+"); - - if (line.contains("##")) { - } else if (line.contains("#CHROM")) { - for (int i = 0; i < pieces.length; i++) { - header.add(i, pieces[i]); - } - } else { - for (int i = 0; i < pieces.length; i++) { - variant.add(i, pieces[i]); - } - - for (int i = 0; i < pieces.length; i++) { - if (i < 9 || header.get(i).contains("NA12892") || header.get(i).contains("NA10851")) { - out.printf("%s => %s\n", header.get(i), variant.get(i)); - } - } - } - } - } catch (IOException e) {} - } - } - - RodVCF vcfrod = (RodVCF) rod; - VCFRecord record = vcfrod.mCurrentRecord; - - out.println(); - - for (VCFGenotypeRecord gr : record.getVCFGenotypeRecords()) { - if (gr.getSampleName().equalsIgnoreCase("NA12892") || gr.getSampleName().equalsIgnoreCase("NA10851")) { - out.printf("%s: %s:%s:%s:%s\n", gr.getSampleName(), - gr.toGenotypeString(record.getAlternateAlleles()), - gr.getFields().get("GQ"), - gr.getFields().get("DP"), - gr.getFields().get("HQ")); - } - } - } - } - - return 0; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer value, Integer sum) { - return 0; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java index fdc728839..7aeb787ec 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.vcftools; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; @@ -9,7 +8,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.StingException; import java.util.*; import java.io.File; @@ -64,7 +62,7 @@ public class VCFSubsetWalker extends RefWalker, VCFWriter> initializeWriter(); } - //out.println(record.toStringRepresentation(vcfrod.getHeader())); + //out.println(record.toStringEncoding(vcfrod.getHeader())); records.add(record); } @@ -117,7 +115,7 @@ public class VCFSubsetWalker extends RefWalker, VCFWriter> if (writer != null) { writer.addRecord(subset); } else { - out.println(subset.toStringRepresentation(vheader)); + out.println(subset.toStringEncoding(vheader)); } } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 80a4461b4..87940d3ef 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -86,7 +86,7 @@ public class VCFGenotypeRecord { return mFields; } - public String toGenotypeString(List altAlleles) { + private String toGenotypeString(List altAlleles) { String str = ""; boolean first = true; for (VCFGenotypeEncoding allele : mGenotypeAlleles) { @@ -133,4 +133,24 @@ public class VCFGenotypeRecord { } return false; } + + /** + * output a string representation of the VCFGenotypeRecord, given the alternate alleles + * + * @param altAlleles the alternate alleles, needed for toGenotypeString() + * + * @return a string + */ + public String toStringEncoding(List altAlleles) { + StringBuilder builder = new StringBuilder(); + builder.append(toGenotypeString(altAlleles)); + boolean first = true; + for (String field : mFields.keySet()) { + if (mFields.get(field).equals("")) continue; + builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR); + builder.append(mFields.get(field)); + + } + return builder.toString(); + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index 90df9c9a5..de5fe6198 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -21,7 +21,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { private VCFHeader mHeader = null; private String mSource; private String mReferenceName; - private final Set mSampleNames = new HashSet(); + private final Set mSampleNames = new LinkedHashSet(); /** our log, which we want to capture anything from this class */ protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java index 5c1a3cfbf..d06bbdd59 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java @@ -23,7 +23,7 @@ public class VCFHeader { private final Map mMetaData = new HashMap(); // the list of auxillary tags - private final Set mGenotypeSampleNames = new HashSet(); + private final Set mGenotypeSampleNames = new LinkedHashSet(); // the character string that indicates meta data public static final String METADATA_INDICATOR = "##"; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java index cb853a280..41480c06f 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java @@ -124,9 +124,8 @@ public class VCFReader implements Iterator, Iterable { * @return a VCF Header created from the list of stinrgs */ protected VCFHeader createHeader(List headerStrings) { - Map metaData = new HashMap(); - Set auxTags = new HashSet(); + Set auxTags = new LinkedHashSet(); // iterate over all the passed in strings for (String str : headerStrings) { Matcher matcher = pMeta.matcher(str); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 42ddd23b5..8e93cd2ea 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -365,7 +365,7 @@ public class VCFRecord { * @param header the VCF header for this VCF Record * @return a string */ - public String toStringRepresentation(VCFHeader header) { + public String toStringEncoding(VCFHeader header) { StringBuilder builder = new StringBuilder(); // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO @@ -425,13 +425,7 @@ public class VCFRecord { builder.append(FIELD_SEPERATOR); if (gMap.containsKey(genotype)) { VCFGenotypeRecord rec = gMap.get(genotype); - if (!rec.toGenotypeString(this.mAlts).equals("")) - builder.append(rec.toGenotypeString(this.mAlts)); - for (String s : rec.getFields().keySet()) { - if (rec.getFields().get(s).equals("")) continue; - builder.append(GENOTYPE_FIELD_SEPERATOR); - builder.append(rec.getFields().get(s)); - } + builder.append(rec.toStringEncoding(this.mAlts)); gMap.remove(genotype); } else { builder.append(VCFGenotypeRecord.EMPTY_GENOTYPE); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index ddbd2b2d5..7d7d173f5 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -74,7 +74,7 @@ public class VCFWriter { * @param record the record to output */ public void addRecord(VCFRecord record) { - String vcfString = record.toStringRepresentation(mHeader); + String vcfString = record.toStringEncoding(mHeader); try { mWriter.write(vcfString + "\n"); } catch (IOException e) { diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java index 0ef284ec3..99f5501a6 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java @@ -99,13 +99,13 @@ public class RodVCFTest extends BaseTest { while (iter.hasNext()) { VCFRecord rec1 = reader.next(); VCFRecord rec2 = iter.next().mCurrentRecord; - if (!rec1.toStringRepresentation(mHeader).equals(rec2.toStringRepresentation(mHeader))) { - fail("VCF record rec1.toString() != rec2.toString()"); + if (!rec1.toStringEncoding(mHeader).equals(rec2.toStringEncoding(mHeader))) { + fail("VCF record rec1.toStringEncoding() != rec2.toStringEncoding()"); } // verify the first line too if (first) { - if (!firstLine.equals(rec1.toStringRepresentation(mHeader) + "\n")) { - fail("VCF record rec1.toString() != expected string :\n" + rec1.toStringRepresentation(mHeader) + firstLine); + if (!firstLine.equals(rec1.toStringEncoding(mHeader) + "\n")) { + fail("VCF record rec1.toStringEncoding() != expected string :\n" + rec1.toStringEncoding(mHeader) + firstLine); } first = false; } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 2e6a1c34e..395566ea8 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -50,7 +50,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("4d7e6670ec71e3b7065ca7e25855189d")); + Arrays.asList("0ff01251afdabafb4e137357b25be72a")); executeTest("test file has annotations, not asking for annotations, #1", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -all -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("94103b529e013748696be3e5dd0d90ee")); + Arrays.asList("ac80f5a5ab06037f938861effcceb66f")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("386ab9b98b708a840a1ae915624777d5")); + Arrays.asList("18d85a9711d56bdf7e2327b83d6745e2")); executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); } @@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -all -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("cf2f60758269759f7be79ad18b12771b")); + Arrays.asList("cda6af61023e67ce8cc8d57cc6cd155b")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java index 30188fc3d..712db17ce 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java @@ -30,7 +30,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testNWayVenn() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1, - Arrays.asList("e067aace9080bafdf3312632de60b4fc")); + Arrays.asList("9a5910137b6b9745f6e0c3ee711a6bfa")); executeTest("testNWayVenn", spec); } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java index ba004237e..6195dbe5a 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java @@ -1,18 +1,22 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.refdata.RodVCF; import org.junit.Assert; import org.junit.Test; -import java.io.File; +import java.io.*; +import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import java.util.Map; /** test the VCFReader class test */ public class VCFReaderTest extends BaseTest { - private static File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf"); - private static File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf"); + private static final File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf"); + private static final File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf"); + private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.vcf"; @Test public void testVCFInput() { @@ -59,12 +63,12 @@ public class VCFReaderTest extends BaseTest { if (!reader.hasNext()) Assert.fail("The reader should have a record"); VCFRecord rec = reader.next(); boolean seenNA11992 = false; - boolean seenNA12287 = false; + boolean seenNA12287 = false; for (VCFGenotypeRecord record : rec.getVCFGenotypeRecords()) { if (record.getSampleName().equals("NA11992")) { List encodings = record.getAlleles(); if (!encodings.get(0).getBases().equals("A") || - !encodings.get(1).getBases().equals("A")) { + !encodings.get(1).getBases().equals("A")) { Assert.fail("Sample NA11992 at site 1:10000005 should be AA"); } seenNA11992 = true; @@ -72,7 +76,7 @@ public class VCFReaderTest extends BaseTest { if (record.getSampleName().equals("NA12287")) { List encodings = record.getAlleles(); if (!encodings.get(0).getBases().equals("A") || - !encodings.get(1).getBases().equals("T")) { + !encodings.get(1).getBases().equals("T")) { Assert.fail("Sample NA11992 at site 1:10000005 should be AA"); } seenNA12287 = true; @@ -135,4 +139,49 @@ public class VCFReaderTest extends BaseTest { Assert.assertTrue(values.get("C").equals("")); Assert.assertTrue(values.get("D").equals("")); } + + @Test + public void testKiransOutOfOrderExample() { + ArrayList header = new ArrayList(); + ArrayList variant = new ArrayList(); + + try { + FileReader reader = new FileReader(VCF_MIXUP_FILE); + BufferedReader breader = new BufferedReader(reader); + String line; + while ((line = breader.readLine()) != null) { + String[] pieces = line.split("\\s+"); + + if (line.contains("##")) { + continue; + } else if (line.contains("#CHROM")) { + for (int i = 0; i < pieces.length; i++) { + header.add(i, pieces[i]); + } + } else { + for (int i = 0; i < pieces.length; i++) { + variant.add(i, pieces[i]); + } + } + } + } catch (FileNotFoundException e) { + Assert.fail("Unable to open the mixed up VCF file."); + } catch (IOException e) { + Assert.fail("Unable to read from the mixed up VCF file."); + } + + // now load up a ROD + Iterator rod = RodVCF.createIterator("name",new File(VCF_MIXUP_FILE)); + RodVCF newRod = rod.next(); + List records = newRod.mCurrentRecord.getVCFGenotypeRecords(); + for (VCFGenotypeRecord record: records) { + if (!header.contains(record.getSampleName())) { + Assert.fail("Parsed header doesn't contain field " + record.getSampleName()); + } + if (!variant.get(header.indexOf(record.getSampleName())).equals(record.toStringEncoding(newRod.mCurrentRecord.getAlternateAlleles()))) { + Assert.fail("Parsed value for " + record.getSampleName() + " doesn't contain the same value ( " + record.toStringEncoding(newRod.mCurrentRecord.getAlternateAlleles()) + + " != " + variant.get(header.indexOf(record.getSampleName()))); + } + } + } } diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java index d1fdafd48..cd3ff8feb 100755 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java @@ -142,10 +142,10 @@ public class VCFRecordTest extends BaseTest { VCFRecord rec = makeFakeVCFRecord(infoFields); Map metaData = new HashMap(); List additionalColumns = new ArrayList(); - String rep = rec.toStringRepresentation(createFakeHeader()); + String rep = rec.toStringEncoding(createFakeHeader()); Assert.assertTrue(stringRep.equals(rep)); rec.addInfoField("AB", "CD"); - String rep2 = rec.toStringRepresentation(createFakeHeader()); + String rep2 = rec.toStringEncoding(createFakeHeader()); Assert.assertTrue(stringRep2.equals(rep2)); //rec.addGenotypeField(createGenotype("sample3","A","D12")); }