Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2011-07-14 19:18:05 -04:00
commit 10cf9245d7
20 changed files with 268 additions and 178 deletions

View File

@ -26,16 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.diffengine;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.Arrays;
import java.util.Map;
import java.util.zip.GZIPInputStream;
/**
@ -58,7 +54,13 @@ public class VCFDiffableReader implements DiffableReader {
VCFCodec vcfCodec = new VCFCodec();
// must be read as state is stored in reader itself
vcfCodec.readHeader(lineReader);
VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader);
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
String key = headerLine.getKey();
if ( headerLine instanceof VCFNamedHeaderLine )
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
root.add(key, headerLine.toString());
}
String line = lineReader.readLine();
int count = 0;

View File

@ -220,6 +220,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
else {
unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly
if (DEBUG)
logger.debug("Unprocessed variant = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
}
int numReads = 0;
@ -1105,7 +1108,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
this.alleles = vc.getAlleles();
this.genotypes = new HashMap<String, Genotype>(vc.getGenotypes()); // since vc.getGenotypes() is unmodifiable
this.negLog10PError = vc.getNegLog10PError();
this.filters = vc.getFilters();
this.filters = vc.filtersWereApplied() ? vc.getFilters() : null;
this.attributes = new HashMap<String, Object>(vc.getAttributes());
}

View File

@ -199,8 +199,8 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
//hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
//hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY);
for ( VCFHeaderLine field : hInfo ) {

View File

@ -7,6 +7,8 @@ import org.broad.tribble.NameAwareCodec;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -96,6 +98,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
for ( String str : headerStrings ) {
if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) {
String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR);
if ( strings.length < VCFHeader.HEADER_FIELDS.values().length )
throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str);
int arrayIndex = 0;
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
try {
@ -159,12 +164,11 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
}
private Feature reallyDecode(String line) {
try {
// the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
// our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new IllegalStateException("VCF Header cannot be null when decoding a record");
if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
if (parts == null)
parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
@ -174,17 +178,18 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
// if we don't have a header, or we have a header with no genotyping data, check that we have eight columns. Otherwise check that we have nine (normal columns + genotyping data)
if (( (header == null || (header != null && !header.hasGenotypingData())) && nParts != NUM_STANDARD_FIELDS) ||
(header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
throw new IllegalArgumentException("There aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
" tokens, and saw " + nParts + " )");
throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
" tokens, and saw " + nParts + " )", lineNo);
return parseVCFLine(parts);
} catch (TribbleException e) {
throw new TribbleException.InvalidDecodeLine(e.getMessage(), line);
}
}
protected void generateException(String message) {
throw new TribbleException.InvalidDecodeLine(message, lineNo);
throw new UserException.MalformedVCF(message, lineNo);
}
private static void generateException(String message, int lineNo) {
throw new UserException.MalformedVCF(message, lineNo);
}
/**
@ -472,10 +477,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
return true;
}
private static void generateException(String message, int lineNo) {
throw new TribbleException.InvalidDecodeLine(message, lineNo);
}
private static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
boolean clipping = true;
// Note that the computation of forward clipping here is meant only to see whether there is a common

View File

@ -32,6 +32,7 @@ import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.util.PositionalStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
@ -300,10 +301,7 @@ public class StandardVCFWriter implements VCFWriter {
} else {
List<String> genotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) {
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
for ( String key : calcVCFGenotypeKeys(vc) ) {
genotypeAttributeKeys.add(key);
}
genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
} else if ( mHeader.hasGenotypingData() ) {
// this needs to be done in case all samples are no-calls
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
@ -387,16 +385,22 @@ public class StandardVCFWriter implements VCFWriter {
continue;
}
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for ( String key : genotypeFormatKeys ) {
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
if ( key.equals(VCFConstants.GENOTYPE_KEY) ) {
if ( !g.isAvailable() ) {
throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
}
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
continue;
}
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
@ -440,9 +444,10 @@ public class StandardVCFWriter implements VCFWriter {
break;
}
for (String s : attrs ) {
mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
mWriter.write(s);
for (int i = 0; i < attrs.size(); i++) {
if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY) )
mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
mWriter.write(attrs.get(i));
}
}
}
@ -488,10 +493,13 @@ public class StandardVCFWriter implements VCFWriter {
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
Set<String> keys = new HashSet<String>();
boolean sawGoodGT = false;
boolean sawGoodQual = false;
boolean sawGenotypeFilter = false;
for ( Genotype g : vc.getGenotypes().values() ) {
keys.addAll(g.getAttributes().keySet());
if ( g.isAvailable() )
sawGoodGT = true;
if ( g.hasNegLog10PError() )
sawGoodQual = true;
if (g.isFiltered() && g.isCalled())
@ -504,7 +512,17 @@ public class StandardVCFWriter implements VCFWriter {
if (sawGenotypeFilter)
keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
return ParsingUtils.sortList(new ArrayList<String>(keys));
List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
// make sure the GT is first
if ( sawGoodGT ) {
List<String> newList = new ArrayList<String>(sortedList.size()+1);
newList.add(VCFConstants.GENOTYPE_KEY);
newList.addAll(sortedList);
sortedList = newList;
}
return sortedList;
}

View File

@ -141,8 +141,6 @@ public class VCF3Codec extends AbstractVCFCodec {
boolean missing = i >= GTValueSplitSize;
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
if (i != 0)
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
@ -156,12 +154,13 @@ public class VCF3Codec extends AbstractVCFCodec {
}
}
// check to make sure we found a genotype field
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field");
// check to make sure we found a genotype field
if ( genotypeAlleleLocation < 0 )
generateException("Unable to find the GT field for the record; the GT field is required");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
// todo -- assuming allele list length in the single digits is bad. Fix me.
// Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {

View File

@ -145,8 +145,6 @@ public class VCFCodec extends AbstractVCFCodec {
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
if (i != 0)
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
@ -160,22 +158,24 @@ public class VCFCodec extends AbstractVCFCodec {
}
}
// check to make sure we found a genotype field
// TODO -- This is no longer required in v4.1
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field");
// check to make sure we found a genotype field if we are a VCF4.0 file
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
// todo -- assuming allele list length in the single digits is bad. Fix me.
// Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.put(sampleName, new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
GTQual,
genotypeFilters,
gtAttributes,
phased));
genotypes.put(sampleName,
new Genotype(sampleName,
GTalleles,
GTQual,
genotypeFilters,
gtAttributes,
phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}

View File

@ -154,6 +154,16 @@ public class UserException extends ReviewedStingException {
}
}
/**
 * Thrown when a VCF file being parsed violates the VCF specification.
 * Carries either the offending line's content or its line number so the
 * user can locate the problem in the input file.
 */
public static class MalformedVCF extends UserException {
    /**
     * @param message description of the formatting problem
     * @param line    the raw VCF line that failed to parse
     */
    public MalformedVCF(String message, String line) {
        super(String.format("The provided VCF file is malformed at line %s: %s", line, message));
    }

    /**
     * @param message description of the formatting problem
     * @param lineNo  the line number within the VCF file at which parsing failed
     */
    public MalformedVCF(String message, int lineNo) {
        // Fixed typo in user-facing message: "nmber" -> "number".
        super(String.format("The provided VCF file is malformed at line number %d: %s", lineNo, message));
    }
}
public static class ReadMissingReadGroup extends MalformedBAM {
public ReadMissingReadGroup(SAMRecord read) {
super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName()));

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -19,12 +20,14 @@ public class Genotype {
protected InferredGeneticContext commonInfo;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
protected List<Allele> alleles = null; // new ArrayList<Allele>();
protected Type type = null;
protected boolean isPhased = false;
private boolean filtersWereAppliedToContext;
protected boolean filtersWereAppliedToContext;
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
this.alleles = Collections.unmodifiableList(alleles);
if ( alleles != null )
this.alleles = Collections.unmodifiableList(alleles);
commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
filtersWereAppliedToContext = filters != null;
this.isPhased = isPhased;
@ -66,6 +69,9 @@ public class Genotype {
}
public List<Allele> getAlleles(Allele allele) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
List<Allele> al = new ArrayList<Allele>();
for ( Allele a : alleles )
if ( a.equals(allele) )
@ -75,6 +81,8 @@ public class Genotype {
}
public Allele getAllele(int i) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
return alleles.get(i);
}
@ -89,10 +97,21 @@ public class Genotype {
NO_CALL,
HOM_REF,
HET,
HOM_VAR
HOM_VAR,
UNAVAILABLE
}
public Type getType() {
if ( type == null ) {
type = determineType();
}
return type;
}
protected Type determineType() {
if ( alleles == null )
return Type.UNAVAILABLE;
Allele firstAllele = alleles.get(0);
if ( firstAllele.isNoCall() ) {
@ -122,7 +141,8 @@ public class Genotype {
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
*/
public boolean isNoCall() { return getType() == Type.NO_CALL; }
public boolean isCalled() { return getType() != Type.NO_CALL; }
public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }
public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }
//
// Useful methods for getting genotype likelihoods for a genotype object, if present
@ -157,8 +177,8 @@ public class Genotype {
}
public void validate() {
if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles");
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles");
if ( alleles == null ) return;
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0");
int nNoCalls = 0;
for ( Allele allele : alleles ) {
@ -175,6 +195,9 @@ public class Genotype {
}
public String getGenotypeString(boolean ignoreRefState) {
if ( alleles == null )
return null;
// Notes:
// 1. Make sure to use the appropriate separator depending on whether the genotype is phased
// 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)

View File

@ -1206,9 +1206,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName());
for ( Allele gAllele : g.getAlleles() ) {
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
if ( g.isAvailable() ) {
for ( Allele gAllele : g.getAlleles() ) {
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
}
}
}
}

View File

@ -26,7 +26,9 @@
package org.broadinstitute.sting;
import org.apache.commons.lang.StringUtils;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
@ -64,10 +66,19 @@ public class WalkerTest extends BaseTest {
}
System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile);
Assert.assertTrue(IndexFactory.onDiskIndexEqualToNewlyCreatedIndex(resultFile, indexFile, new VCFCodec()), "Index on disk from indexing on the fly not equal to the index created after the run completed");
Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec());
Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath());
if ( ! indexFromOutputFile.equals(dynamicIndex) ) {
Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n",
indexFromOutputFile.getProperties(),
dynamicIndex.getProperties()));
}
}
}
public List<String> assertMatchingMD5s(final String name, List<File> resultFiles, List<String> expectedMD5s) {
List<String> md5s = new ArrayList<String>();
for (int i = 0; i < resultFiles.size(); i++) {

View File

@ -87,7 +87,7 @@ public class DiffableReaderUnitTest extends BaseTest {
Assert.assertSame(diff.getParent(), DiffElement.ROOT);
DiffNode node = diff.getValueAsNode();
Assert.assertEquals(node.getElements().size(), 9);
Assert.assertEquals(node.getElements().size(), 10);
// chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03
DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode();

View File

@ -23,7 +23,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1)
+ " -L chr20:556259-756570",
1,
Arrays.asList("e312b7d3854d5b2834a370659514a813"));
Arrays.asList("7f11f7f75d1526077f0173c7ed1fc6c4"));
executeTest("Merge MNP sites within genomic distance of 1 [TEST ONE]", spec);
}
@ -33,7 +33,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10)
+ " -L chr20:556259-756570",
1,
Arrays.asList("681f50e45f1d697370d2c355df2e18bc"));
Arrays.asList("53dd312468296826bdd3c22387390c88"));
executeTest("Merge MNP sites within genomic distance of 10 [TEST TWO]", spec);
}
@ -43,7 +43,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100)
+ " -L chr20:556259-756570",
1,
Arrays.asList("0bccb0ef928a108418246bec01098083"));
Arrays.asList("e26f92d2fb9f4eaeac7f9d8ee27410ee"));
executeTest("Merge MNP sites within genomic distance of 100 [TEST THREE]", spec);
}

View File

@ -23,7 +23,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1)
+ " -L chr20:556259-756570",
1,
Arrays.asList("e16f957d888054ae0518e25660295241"));
Arrays.asList("af5e1370822551c0c6f50f23447dc627"));
executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec);
}
@ -33,7 +33,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10)
+ " -L chr20:556259-756570",
1,
Arrays.asList("122a482090677c7619c2105d44e00d11"));
Arrays.asList("dd8c44ae1ef059a7fe85399467e102eb"));
executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec);
}
@ -43,7 +43,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100)
+ " -L chr20:556259-756570",
1,
Arrays.asList("bc6a8c8a42bb2601db98e88e9ad74748"));
Arrays.asList("f81fd72ecaa57b3215406fcea860bcc5"));
executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec);
}

View File

@ -1,46 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
public class BatchMergeIntegrationTest extends WalkerTest {
    /**
     * Runs the UnifiedGenotyper in GENOTYPE_GIVEN_ALLELES mode over a fixed
     * BAM / allele-VCF pair and verifies the emitted calls against a known MD5.
     */
    @Test
    public void testBatchMerge1() {
        final String bamFile = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
        final String alleleFile = validationDataLocation + "batch.merge.alleles.vcf";

        // Genotype exactly the sites/alleles listed in the allele VCF, emitting all sites.
        final String commandLine =
                "-T UnifiedGenotyper -NO_HEADER -BTI alleles -stand_call_conf 0.0 -glm BOTH -G none -nsl -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -o %s -R " + b37KGReference
                + " -B:alleles,VCF " + alleleFile
                + " -I " + bamFile;

        final WalkerTestSpec spec = new WalkerTestSpec(
                commandLine,
                1,
                Arrays.asList("f4ed8f4ef2cba96823c06e90e9d0de35"));
        executeTest("testBatchMerge UG genotype given alleles:" + new File(bamFile).getName() + " with " + new File(alleleFile).getName(), spec);
    }
}

View File

@ -20,7 +20,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testVariantsToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("815b82fff92aab41c209eedce2d7e7d9");
md5.add("4accae035d271b35ee2ec58f403c68c6");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -38,7 +38,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("22336ee9c12aa222ce29c3c5babca7d0");
md5.add("71e8c98d7c3a73b6287ecc339086fe03");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -56,7 +56,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingHapMapInput() {
List<String> md5 = new ArrayList<String>();
md5.add("9bedaa7670b86a07be5191898c3727cf");
md5.add("f343085305e80c7a2493422e4eaad983");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -73,7 +73,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingVCFInput() {
List<String> md5 = new ArrayList<String>();
md5.add("cc215edec9ca28e5c79ab1b67506f9f7");
md5.add("86f02e2e764ba35854cff2aa05a1fdd8");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.List;
public class VCFIntegrationTest extends WalkerTest {
    /**
     * Round-trip check: running VariantAnnotator (with nothing to annotate) and then
     * VariantsToVCF over its output should each reproduce the input VCF byte-for-byte,
     * i.e. both outputs must match the input file's MD5.
     */
    @Test
    public void testReadingAndWritingWitHNoChanges() {
        final String expectedMd5 = "a990ba187a69ca44cb9bc2bb44d00447";
        final String inputVcf = validationDataLocation + "vcf4.1.example.vcf";
        final String baseCommand = "-R " + b37KGReference + " -NO_HEADER -o %s ";

        // Pass 1: annotate with no changes requested; output must equal the input.
        final String annotatorCmd = baseCommand + "-T VariantAnnotator -BTI variant -B:variant,vcf " + inputVcf;
        final WalkerTestSpec annotatorSpec = new WalkerTestSpec(annotatorCmd, 1, Arrays.asList(expectedMd5));
        final List<File> annotatorOutput = executeTest("Test Variant Annotator with no changes", annotatorSpec).getFirst();

        // Pass 2: re-emit pass 1's output through VariantsToVCF; it must still match.
        final String toVcfCmd = baseCommand + "-T VariantsToVCF -B:variant,vcf " + annotatorOutput.get(0).getAbsolutePath();
        final WalkerTestSpec toVcfSpec = new WalkerTestSpec(toVcfCmd, 1, Arrays.asList(expectedMd5));
        executeTest("Test Variants To VCF from new output", toVcfSpec);
    }
}

View File

@ -4,13 +4,13 @@ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.function.ListWriterFunction
import scala.io.Source._
import collection.JavaConversions._
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
import org.broadinstitute.sting.queue.extensions.picard._
import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord}
import net.sf.samtools.{SAMFileReader}
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.qscripts.utils.Utils
class DataProcessingPipeline extends QScript {
qscript =>
@ -103,18 +103,6 @@ class DataProcessingPipeline extends QScript {
val ds: String)
{}
// Utility function to check if there are multiple samples in a BAM file (currently we can't deal with that)
def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = {
var sample: String = ""
for (r <- readGroups) {
if (sample.isEmpty)
sample = r.getSample
else if (sample != r.getSample)
return true;
}
return false
}
// Utility function to merge all bam files of similar samples. Generates one BAM file per sample.
// It uses the sample information on the header of the input BAM files.
//
@ -135,7 +123,7 @@ class DataProcessingPipeline extends QScript {
// only allow one sample per file. Bam files with multiple samples would require pre-processing of the file
// with PrintReads to separate the samples. Tell user to do it himself!
assert(!hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam)
assert(!Utils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam)
// Fill out the sample table with the readgroups in this file
for (rg <- readGroups) {
@ -147,20 +135,23 @@ class DataProcessingPipeline extends QScript {
}
}
println("\n\n*** DEBUG ***\n")
// Creating one file for each sample in the dataset
val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
for ((sample, flist) <- sampleTable) {
println(sample + ":")
for (f <- flist)
println (f)
println()
val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".bam")
sampleBamFiles(sample) = sampleFileName
add(joinBams(flist, sampleFileName))
}
return sampleBamFiles.toMap
}
println("*** DEBUG ***\n\n")
// Checks how many contigs are in the dataset. Uses the BAM file header information.
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
return sampleBamFiles.toMap
}
// Rebuilds the Read Group string to give BWA
@ -206,17 +197,6 @@ class DataProcessingPipeline extends QScript {
return realignedBams
}
// Reads a BAM LIST file and creates a scala list with all the files
def createListFromFile(in: File):List[File] = {
if (in.toString.endsWith("bam"))
return List(in)
var l: List[File] = List()
for (bam <- fromFile(in).getLines)
l :+= new File(bam)
return l
}
/****************************************************************************
* Main script
@ -226,17 +206,14 @@ class DataProcessingPipeline extends QScript {
def script = {
// keep a record of the number of contigs in the first bam file in the list
val bams = createListFromFile(input)
nContigs = getNumberOfContigs(bams(0))
val bams = Utils.createListFromFile(input)
nContigs = Utils.getNumberOfContigs(bams(0))
val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams}
// Generate a BAM file per sample joining all per lane files if necessary
val sampleBamFiles: Map[String, File] = createSampleFiles(bams, realignedBams)
println("nContigs: " + nContigs)
// Final output list of processed bam files
var cohortList: List[File] = List()
@ -244,6 +221,7 @@ class DataProcessingPipeline extends QScript {
println("\nFound the following samples: ")
for ((sample, file) <- sampleBamFiles)
println("\t" + sample + " -> " + file)
println("\n")
// If this is a 'knowns only' indel realignment run, do it only once for all samples.
val globalIntervals = new File(outputDir + projectName + ".intervals")

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import net.sf.samtools.SAMFileReader
import io.Source._
import org.broadinstitute.sting.queue.qscripts.utils.Utils
/**
* Created by IntelliJ IDEA.
@ -32,26 +34,25 @@ class RecalibrateBaseQualities extends QScript {
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
def script = {
nContigs = getNumberOfContigs(input)
val bamList = Utils.createListFromFile(input)
nContigs = Utils.getNumberOfContigs(bamList(0))
val recalFile1: File = swapExt(input, ".bam", "recal1.csv")
val recalFile2: File = swapExt(input, ".bam", "recal2.csv")
val recalBam: File = swapExt(input, ".bam", "recal.bam")
val path1: String = "before"
val path2: String = "after"
add(cov(input, recalFile1),
recal(input, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
for (bam <- bamList) {
val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv")
val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv")
val recalBam: File = swapExt(bam, ".bam", ".recal.bam")
val path1: String = bam + "before"
val path2: String = bam + "after"
add(cov(bam, recalFile1),
recal(bam, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
}
}
trait CommandLineGATKArgs extends CommandLineGATK {

View File

@ -0,0 +1,60 @@
package org.broadinstitute.sting.queue.qscripts.utils
import java.io.File
import io.Source._
import net.sf.samtools.{SAMReadGroupRecord, SAMFileReader}
import collection.JavaConversions._
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 7/14/11
* Time: 4:57 PM
* To change this template use File | Settings | File Templates.
*/
/** Shared helpers for the Queue QScripts: BAM-list parsing and BAM-header queries. */
object Utils {
/**
* Takes a BAM list file and produces a Scala list with one entry per file, allowing
* the list to contain empty lines and comment lines (lines starting with #).
*/
def createListFromFile(in: File):List[File] = {
// If the file provided ends with .bam, it is not a bam list; we treat it as a single file
// and return a list with only this file.
if (in.toString.endsWith(".bam"))
return List(in)
var list: List[File] = List()
// Skip comment (#) and blank lines; trim surrounding whitespace from each path.
for (bam <- fromFile(in).getLines)
if (!bam.startsWith("#") && !bam.isEmpty )
list :+= new File(bam.trim())
list
}
/**
* Returns the number of contigs (sequence-dictionary entries) in the BAM file's header.
*/
def getNumberOfContigs(bamFile: File): Int = {
// NOTE(review): bamFile is already a File; the extra new File(...) wrapper appears
// redundant and presumably relies on an implicit conversion — confirm before changing.
val samReader = new SAMFileReader(new File(bamFile))
samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
/**
* Checks whether the given read groups span more than one sample name;
* returns true as soon as a second distinct sample is seen.
*/
def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = {
// Remember the first (non-empty) sample name, then compare all later read groups to it.
var sample: String = ""
for (r <- readGroups) {
if (sample.isEmpty)
sample = r.getSample
else if (sample != r.getSample)
return true;
}
false
}
}