diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 06d14366f..a812babaf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -26,16 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.diffengine; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.*; -import java.util.Arrays; import java.util.Map; -import java.util.zip.GZIPInputStream; /** @@ -58,7 +54,13 @@ public class VCFDiffableReader implements DiffableReader { VCFCodec vcfCodec = new VCFCodec(); // must be read as state is stored in reader itself - vcfCodec.readHeader(lineReader); + VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader); + for ( VCFHeaderLine headerLine : header.getMetaData() ) { + String key = headerLine.getKey(); + if ( headerLine instanceof VCFNamedHeaderLine ) + key += "_" + ((VCFNamedHeaderLine) headerLine).getName(); + root.add(key, headerLine.toString()); + } String line = lineReader.readLine(); int count = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 4833a6cad..1d9616aac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -220,6 +220,9 @@ public class ReadBackedPhasingWalker extends RodWalker unclippedAlleles, String ref) { boolean clipping = true; // Note that the computation of forward clipping here is meant only to see whether there is a common diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index a8bf74707..e7ddac185 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -32,6 +32,7 @@ import org.broad.tribble.index.IndexFactory; import org.broad.tribble.util.LittleEndianOutputStream; import org.broad.tribble.util.ParsingUtils; import org.broad.tribble.util.PositionalStream; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -300,10 +301,7 @@ public class StandardVCFWriter implements VCFWriter { } else { List genotypeAttributeKeys = new ArrayList(); if ( vc.hasGenotypes() ) { - genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); - for ( String key : calcVCFGenotypeKeys(vc) ) { - genotypeAttributeKeys.add(key); - } + genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc)); } else if ( mHeader.hasGenotypingData() ) { // this needs to be done in case all samples are no-calls genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); @@ -387,16 +385,22 @@ public class StandardVCFWriter implements VCFWriter { continue; } - writeAllele(g.getAllele(0), alleleMap); - for (int i = 1; i < g.getPloidy(); i++) { - mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); - writeAllele(g.getAllele(i), alleleMap); - } - List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String key : genotypeFormatKeys ) { - if ( key.equals(VCFConstants.GENOTYPE_KEY) ) + + if ( key.equals(VCFConstants.GENOTYPE_KEY) ) { + if ( !g.isAvailable() ) { + throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record"); + } + + writeAllele(g.getAllele(0), alleleMap); + for (int i = 1; i < g.getPloidy(); i++) { + mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); + writeAllele(g.getAllele(i), alleleMap); + } + continue; + } Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4; @@ -440,9 +444,10 @@ public class StandardVCFWriter implements VCFWriter { break; } - for (String s : attrs ) { - mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR); - mWriter.write(s); + for (int i = 0; i < attrs.size(); i++) { + if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY) ) + mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR); + mWriter.write(attrs.get(i)); } } } @@ -488,10 +493,13 @@ public class StandardVCFWriter implements VCFWriter { private static List calcVCFGenotypeKeys(VariantContext vc) { Set keys = new HashSet(); + boolean sawGoodGT = false; boolean sawGoodQual = false; boolean sawGenotypeFilter = false; for ( Genotype g : vc.getGenotypes().values() ) { keys.addAll(g.getAttributes().keySet()); + if ( g.isAvailable() ) + sawGoodGT = true; if ( g.hasNegLog10PError() ) sawGoodQual = true; if (g.isFiltered() && g.isCalled()) @@ -504,7 +512,17 @@ public class StandardVCFWriter implements VCFWriter { if (sawGenotypeFilter) keys.add(VCFConstants.GENOTYPE_FILTER_KEY); - return ParsingUtils.sortList(new ArrayList(keys)); + List sortedList = ParsingUtils.sortList(new ArrayList(keys)); + + // make sure the GT is first + if ( sawGoodGT ) { + List newList = new ArrayList(sortedList.size()+1); + newList.add(VCFConstants.GENOTYPE_KEY); + newList.addAll(sortedList); + sortedList = newList; + } + + return sortedList; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index f3c99e963..c29f2ba8b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -141,8 +141,6 @@ public class VCF3Codec extends AbstractVCFCodec { boolean missing = i >= GTValueSplitSize; if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { - if (i != 0) - generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes"); genotypeAlleleLocation = i; } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); @@ -156,12 +154,13 @@ public class VCF3Codec extends AbstractVCFCodec { } } - // check to make sure we found a gentoype field - if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field"); + // check to make sure we found a genotype field + if ( genotypeAlleleLocation < 0 ) + generateException("Unable to find the GT field for the record; the GT field is required"); + if ( genotypeAlleleLocation > 0 ) + generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes"); - // todo -- assuming allele list length in the single digits is bad. Fix me. - // Check for > 1 for haploid genotypes - boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|'; + boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; // add it to the list try { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 0fb2940bb..05fff5d9e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -145,8 +145,6 @@ public class VCFCodec extends AbstractVCFCodec { // todo -- all of these on the fly parsing of the missing value should be static constants if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { - if (i != 0) - generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes"); genotypeAlleleLocation = i; } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); @@ -160,22 +158,24 @@ public class VCFCodec extends AbstractVCFCodec { } } - // check to make sure we found a gentoype field - // TODO -- This is no longer required in v4.1 - if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field"); + // check to make sure we found a genotype field if we are a VCF4.0 file + if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 ) + generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0"); + if ( genotypeAlleleLocation > 0 ) + generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); - // todo -- assuming allele list length in the single digits is bad. Fix me. - // Check for > 1 for haploid genotypes - boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|'; + List GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); + boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1; // add it to the list try { - genotypes.put(sampleName, new Genotype(sampleName, - parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap), - GTQual, - genotypeFilters, - gtAttributes, - phased)); + genotypes.put(sampleName, + new Genotype(sampleName, + GTalleles, + GTQual, + genotypeFilters, + gtAttributes, + phased)); } catch (TribbleException e) { throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 0be4bec91..17c4a7df4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -154,6 +154,16 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedVCF extends UserException { + public MalformedVCF(String message, String line) { + super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + } + + public MalformedVCF(String message, int lineNo) { + super(String.format("The provided VCF file is malformed at line nmber %d: %s", lineNo, message)); + } + } + public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index 3a87f1196..0b5976c3c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -19,12 +20,14 @@ public class Genotype { protected InferredGeneticContext commonInfo; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; protected List alleles = null; // new ArrayList(); + protected Type type = null; protected boolean isPhased = false; - private boolean filtersWereAppliedToContext; + protected boolean filtersWereAppliedToContext; public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { - this.alleles = Collections.unmodifiableList(alleles); + if ( alleles != null ) + this.alleles = Collections.unmodifiableList(alleles); commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); filtersWereAppliedToContext = filters != null; this.isPhased = isPhased; @@ -66,6 +69,9 @@ public class Genotype { } public List getAlleles(Allele allele) { + if ( getType() == Type.UNAVAILABLE ) + throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); + List al = new ArrayList(); for ( Allele a : alleles ) if ( a.equals(allele) ) @@ -75,6 +81,8 @@ public class Genotype { } public Allele getAllele(int i) { + if ( getType() == Type.UNAVAILABLE ) + throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); return alleles.get(i); } @@ -89,10 +97,21 @@ public class Genotype { NO_CALL, HOM_REF, HET, - HOM_VAR + HOM_VAR, + UNAVAILABLE } public Type getType() { + if ( type == null ) { + type = determineType(); + } + return type; + } + + protected Type determineType() { + if ( alleles == null ) + return Type.UNAVAILABLE; + Allele firstAllele = alleles.get(0); if ( firstAllele.isNoCall() ) { @@ -122,7 +141,8 @@ public class Genotype { * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF) */ public boolean isNoCall() { return getType() == Type.NO_CALL; } - public boolean isCalled() { return getType() != Type.NO_CALL; } + public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; } + public boolean isAvailable() { return getType() != Type.UNAVAILABLE; } // // Useful methods for getting genotype likelihoods for a genotype object, if present @@ -157,8 +177,8 @@ public class Genotype { } public void validate() { - if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles"); - if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles"); + if ( alleles == null ) return; + if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0"); int nNoCalls = 0; for ( Allele allele : alleles ) { @@ -175,6 +195,9 @@ public class Genotype { } public String getGenotypeString(boolean ignoreRefState) { + if ( alleles == null ) + return null; + // Notes: // 1. Make sure to use the appropriate separator depending on whether the genotype is phased // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index da80a3431..92c5d648b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1206,9 +1206,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName()); - for ( Allele gAllele : g.getAlleles() ) { - if ( ! hasAllele(gAllele) && gAllele.isCalled() ) - throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); + if ( g.isAvailable() ) { + for ( Allele gAllele : g.getAlleles() ) { + if ( ! hasAllele(gAllele) && gAllele.isCalled() ) + throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); + } } } } diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index dacaf2738..d65f4ec34 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting; import org.apache.commons.lang.StringUtils; +import org.broad.tribble.FeatureCodec; import org.broad.tribble.Tribble; +import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.gatk.CommandLineExecutable; @@ -64,10 +66,19 @@ public class WalkerTest extends BaseTest { } System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile); - Assert.assertTrue(IndexFactory.onDiskIndexEqualToNewlyCreatedIndex(resultFile, indexFile, new VCFCodec()), "Index on disk from indexing on the fly not equal to the index created after the run completed"); + Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec()); + Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + if ( ! indexFromOutputFile.equals(dynamicIndex) ) { + Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", + indexFromOutputFile.getProperties(), + dynamicIndex.getProperties())); + } } } + + public List assertMatchingMD5s(final String name, List resultFiles, List expectedMD5s) { List md5s = new ArrayList(); for (int i = 0; i < resultFiles.size(); i++) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java index baa2f0383..a0cb47770 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java @@ -87,7 +87,7 @@ public class DiffableReaderUnitTest extends BaseTest { Assert.assertSame(diff.getParent(), DiffElement.ROOT); DiffNode node = diff.getValueAsNode(); - Assert.assertEquals(node.getElements().size(), 9); + Assert.assertEquals(node.getElements().size(), 10); // chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java index 3f87fc1a2..c88eac149 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java @@ -23,7 +23,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) + " -L chr20:556259-756570", 1, - Arrays.asList("e312b7d3854d5b2834a370659514a813")); + Arrays.asList("7f11f7f75d1526077f0173c7ed1fc6c4")); executeTest("Merge MNP sites within genomic distance of 1 [TEST ONE]", spec); } @@ -33,7 +33,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) + " -L chr20:556259-756570", 1, - Arrays.asList("681f50e45f1d697370d2c355df2e18bc")); + Arrays.asList("53dd312468296826bdd3c22387390c88")); executeTest("Merge MNP sites within genomic distance of 10 [TEST TWO]", spec); } @@ -43,7 +43,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) + " -L chr20:556259-756570", 1, - Arrays.asList("0bccb0ef928a108418246bec01098083")); + Arrays.asList("e26f92d2fb9f4eaeac7f9d8ee27410ee")); executeTest("Merge MNP sites within genomic distance of 100 [TEST THREE]", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java index 009048c10..f855c1dd3 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java @@ -23,7 +23,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) + " -L chr20:556259-756570", 1, - Arrays.asList("e16f957d888054ae0518e25660295241")); + Arrays.asList("af5e1370822551c0c6f50f23447dc627")); executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec); } @@ -33,7 +33,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) + " -L chr20:556259-756570", 1, - Arrays.asList("122a482090677c7619c2105d44e00d11")); + Arrays.asList("dd8c44ae1ef059a7fe85399467e102eb")); executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec); } @@ -43,7 +43,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) + " -L chr20:556259-756570", 1, - Arrays.asList("bc6a8c8a42bb2601db98e88e9ad74748")); + Arrays.asList("f81fd72ecaa57b3215406fcea860bcc5")); executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java deleted file mode 100755 index 7e1d86105..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; - -public class BatchMergeIntegrationTest extends WalkerTest { - @Test - public void testBatchMerge1() { - String bam = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - String alleles = validationDataLocation + "batch.merge.alleles.vcf"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T UnifiedGenotyper -NO_HEADER -BTI alleles -stand_call_conf 0.0 -glm BOTH -G none -nsl -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -o %s -R " + b37KGReference - + " -B:alleles,VCF " + alleles - + " -I " + bam, - 1, - Arrays.asList("f4ed8f4ef2cba96823c06e90e9d0de35")); - executeTest("testBatchMerge UG genotype given alleles:" + new File(bam).getName() + " with " + new File(alleles).getName(), spec); - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java index 8421076c9..8c96c1e11 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java @@ -20,7 +20,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testVariantsToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("815b82fff92aab41c209eedce2d7e7d9"); + md5.add("4accae035d271b35ee2ec58f403c68c6"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -38,7 +38,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("22336ee9c12aa222ce29c3c5babca7d0"); + md5.add("71e8c98d7c3a73b6287ecc339086fe03"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -56,7 +56,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingHapMapInput() { List md5 = new ArrayList(); - md5.add("9bedaa7670b86a07be5191898c3727cf"); + md5.add("f343085305e80c7a2493422e4eaad983"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + @@ -73,7 +73,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingVCFInput() { List md5 = new ArrayList(); - md5.add("cc215edec9ca28e5c79ab1b67506f9f7"); + md5.add("86f02e2e764ba35854cff2aa05a1fdd8"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java new file mode 100644 index 000000000..32ff25c7b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.utils.codecs.vcf; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class VCFIntegrationTest extends WalkerTest { + + @Test + public void testReadingAndWritingWitHNoChanges() { + + String md5ofInputVCF = "a990ba187a69ca44cb9bc2bb44d00447"; + String testVCF = validationDataLocation + "vcf4.1.example.vcf"; + + String baseCommand = "-R " + b37KGReference + " -NO_HEADER -o %s "; + + String test1 = baseCommand + "-T VariantAnnotator -BTI variant -B:variant,vcf " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(md5ofInputVCF)); + List result = executeTest("Test Variant Annotator with no changes", spec1).getFirst(); + + String test2 = baseCommand + "-T VariantsToVCF -B:variant,vcf " + result.get(0).getAbsolutePath(); + WalkerTestSpec spec2 = new WalkerTestSpec(test2, 1, Arrays.asList(md5ofInputVCF)); + executeTest("Test Variants To VCF from new output", spec2); + } +} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index f9369ee3f..b6fe59f6d 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -4,13 +4,13 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction -import scala.io.Source._ import collection.JavaConversions._ import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel import org.broadinstitute.sting.queue.extensions.picard._ -import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord} +import net.sf.samtools.{SAMFileReader} import net.sf.samtools.SAMFileHeader.SortOrder +import org.broadinstitute.sting.queue.qscripts.utils.Utils class DataProcessingPipeline extends QScript { qscript => @@ -103,18 +103,6 @@ class DataProcessingPipeline extends QScript { val ds: String) {} - // Utility function to check if there are multiple samples in a BAM file (currently we can't deal with that) - def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { - var sample: String = "" - for (r <- readGroups) { - if (sample.isEmpty) - sample = r.getSample - else if (sample != r.getSample) - return true; - } - return false - } - // Utility function to merge all bam files of similar samples. Generates one BAM file per sample. // It uses the sample information on the header of the input BAM files. // @@ -135,7 +123,7 @@ class DataProcessingPipeline extends QScript { // only allow one sample per file. Bam files with multiple samples would require pre-processing of the file // with PrintReads to separate the samples. Tell user to do it himself! - assert(!hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) + assert(!Utils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) // Fill out the sample table with the readgroups in this file for (rg <- readGroups) { @@ -147,20 +135,23 @@ class DataProcessingPipeline extends QScript { } } + println("\n\n*** DEBUG ***\n") // Creating one file for each sample in the dataset val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] for ((sample, flist) <- sampleTable) { + + println(sample + ":") + for (f <- flist) + println (f) + println() + val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".bam") sampleBamFiles(sample) = sampleFileName add(joinBams(flist, sampleFileName)) } - return sampleBamFiles.toMap - } + println("*** DEBUG ***\n\n") - // Checks how many contigs are in the dataset. Uses the BAM file header information. - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() + return sampleBamFiles.toMap } // Rebuilds the Read Group string to give BWA @@ -206,17 +197,6 @@ class DataProcessingPipeline extends QScript { return realignedBams } - // Reads a BAM LIST file and creates a scala list with all the files - def createListFromFile(in: File):List[File] = { - if (in.toString.endsWith("bam")) - return List(in) - var l: List[File] = List() - for (bam <- fromFile(in).getLines) - l :+= new File(bam) - return l - } - - /**************************************************************************** * Main script @@ -226,17 +206,14 @@ class DataProcessingPipeline extends QScript { def script = { // keep a record of the number of contigs in the first bam file in the list - val bams = createListFromFile(input) - nContigs = getNumberOfContigs(bams(0)) + val bams = Utils.createListFromFile(input) + nContigs = Utils.getNumberOfContigs(bams(0)) val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams} // Generate a BAM file per sample joining all per lane files if necessary val sampleBamFiles: Map[String, File] = createSampleFiles(bams, realignedBams) - - println("nContigs: " + nContigs) - // Final output list of processed bam files var cohortList: List[File] = List() @@ -244,6 +221,7 @@ class DataProcessingPipeline extends QScript { println("\nFound the following samples: ") for ((sample, file) <- sampleBamFiles) println("\t" + sample + " -> " + file) + println("\n") // If this is a 'knowns only' indel realignment run, do it only once for all samples. val globalIntervals = new File(outputDir + projectName + ".intervals") diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index 51de3a7ad..b2960f150 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -3,6 +3,8 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ import net.sf.samtools.SAMFileReader +import io.Source._ +import org.broadinstitute.sting.queue.qscripts.utils.Utils /** * Created by IntelliJ IDEA. @@ -32,26 +34,25 @@ class RecalibrateBaseQualities extends QScript { val queueLogDir: String = ".qlog/" var nContigs: Int = 0 - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() - } - def script = { - nContigs = getNumberOfContigs(input) + val bamList = Utils.createListFromFile(input) + nContigs = Utils.getNumberOfContigs(bamList(0)) - val recalFile1: File = swapExt(input, ".bam", "recal1.csv") - val recalFile2: File = swapExt(input, ".bam", "recal2.csv") - val recalBam: File = swapExt(input, ".bam", "recal.bam") - val path1: String = "before" - val path2: String = "after" - - add(cov(input, recalFile1), - recal(input, recalFile1, recalBam), - cov(recalBam, recalFile2), - analyzeCovariates(recalFile1, path1), - analyzeCovariates(recalFile2, path2)) + for (bam <- bamList) { + + val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv") + val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv") + val recalBam: File = swapExt(bam, ".bam", ".recal.bam") + val path1: String = bam + "before" + val path2: String = bam + "after" + + add(cov(bam, recalFile1), + recal(bam, recalFile1, recalBam), + cov(recalBam, recalFile2), + analyzeCovariates(recalFile1, path1), + analyzeCovariates(recalFile2, path2)) + } } trait CommandLineGATKArgs extends CommandLineGATK { diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala new file mode 100644 index 000000000..1189b376c --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/utils/Utils.scala @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.queue.qscripts.utils + +import java.io.File +import io.Source._ +import net.sf.samtools.{SAMReadGroupRecord, SAMFileReader} + +import collection.JavaConversions._ + + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 7/14/11 + * Time: 4:57 PM + * To change this template use File | Settings | File Templates. + */ + +object Utils { + + /** + * Takes a bam list file and produces a scala list with each file allowing the bam list + * to have empty lines and comment lines (lines starting with #). + */ + def createListFromFile(in: File):List[File] = { + // If the file provided ends with .bam, it is not a bam list, we treat it as a single file. + // and return a list with only this file. + if (in.toString.endsWith(".bam")) + return List(in) + + var list: List[File] = List() + for (bam <- fromFile(in).getLines) + if (!bam.startsWith("#") && !bam.isEmpty ) + list :+= new File(bam.trim()) + list + } + + /** + * Returns the number of contigs in the BAM file header. + */ + def getNumberOfContigs(bamFile: File): Int = { + val samReader = new SAMFileReader(new File(bamFile)) + samReader.getFileHeader.getSequenceDictionary.getSequences.size() + } + + /** + * Check if there are multiple samples in a BAM file + */ + def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { + var sample: String = "" + for (r <- readGroups) { + if (sample.isEmpty) + sample = r.getSample + else if (sample != r.getSample) + return true; + } + false + } + + +} \ No newline at end of file