Stage 3 of Variation refactoring:

We are now VCF3.3 compliant.
(Only a few more stages left.  Sigh.)



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2287 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-08 21:43:28 +00:00
parent 9e2f831206
commit e8822a3fb4
35 changed files with 416 additions and 220 deletions

View File

@ -90,7 +90,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
public boolean hasStrandBias() {
assertNotNull();
return this.mCurrentRecord.getInfoValues().containsKey("SB");
return this.mCurrentRecord.getInfoValues().containsKey(VCFRecord.STRAND_BIAS_KEY);
}
/**
@ -99,7 +99,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* @return StrandBias with the stored slod
*/
public double getStrandBias() {
return hasStrandBias() ? Double.valueOf(this.mCurrentRecord.getInfoValues().get("SB")) : 0.0;
return hasStrandBias() ? Double.valueOf(this.mCurrentRecord.getInfoValues().get(VCFRecord.STRAND_BIAS_KEY)) : 0.0;
}
/** @return the VARIANT_TYPE of the current variant */
@ -291,8 +291,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
}
public RodVCF next() {
mCurrentRecord = mReader.next();
return new RodVCF(name, mCurrentRecord, mReader);
return new RodVCF(name, mReader.next(), mReader);
}
public void remove() {

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
public class DepthOfCoverage extends StandardVariantAnnotation {
@ -12,9 +13,9 @@ public class DepthOfCoverage extends StandardVariantAnnotation {
return String.format("%d", depth);
}
public String getKeyName() { return "DP"; }
public String getKeyName() { return VCFRecord.DEPTH_KEY; }
public String getDescription() { return "DP,1,Integer,\"Total Depth\""; }
public String getDescription() { return getKeyName() + ",1,Integer,\"Total Depth\""; }
public boolean useZeroQualityReads() { return false; }
}

View File

@ -4,6 +4,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import net.sf.samtools.SAMRecord;
import java.util.List;
@ -20,9 +22,9 @@ public class RMSMappingQuality extends StandardVariantAnnotation {
return String.format("%.2f", rms);
}
public String getKeyName() { return "MQ"; }
public String getKeyName() { return VCFRecord.RMS_MAPPING_QUALITY_KEY; }
public String getDescription() { return "MQ,1,Float,\"RMS Mapping Quality\""; }
public String getDescription() { return getKeyName() + ",1,Float,\"RMS Mapping Quality\""; }
public boolean useZeroQualityReads() { return true; }
}

View File

@ -117,10 +117,11 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
}
// setup the header fields
Map<String, String> hInfo = new HashMap<String, String>();
hInfo.put("format", VCFWriter.VERSION);
hInfo.put("source", "VariantAnnotator");
hInfo.put("reference", this.getToolkit().getArguments().referenceFile.getName());
Set<String> hInfo = new HashSet<String>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add("source=VariantAnnotator");
hInfo.add("annotatorReference=" + getToolkit().getArguments().referenceFile.getName());
hInfo.addAll(getVCFAnnotationDescriptions(requestedAnnotations));
vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter = new VCFWriter(vcfHeader, VCF_OUT);
@ -172,6 +173,40 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
return 1;
}
// option #1: don't specify annotations to be used: standard annotations are used by default
public static Set<String> getVCFAnnotationDescriptions() {
if ( standardAnnotations == null )
determineAllAnnotations();
TreeSet<String> descriptions = new TreeSet<String>();
for ( VariantAnnotation annotation : standardAnnotations.values() )
descriptions.add("INFO=" + annotation.getDescription());
return descriptions;
}
// option #2: specify that all possible annotations be used
public static Set<String> getAllVCFAnnotationDescriptions() {
if ( standardAnnotations == null )
determineAllAnnotations();
TreeSet<String> descriptions = new TreeSet<String>();
for ( VariantAnnotation annotation : allAnnotations.values() )
descriptions.add("INFO=" + annotation.getDescription());
return descriptions;
}
// option #3: specify the exact annotations to be used
public static Set<String> getVCFAnnotationDescriptions(Collection<VariantAnnotation> annotations) {
TreeSet<String> descriptions = new TreeSet<String>();
for ( VariantAnnotation annotation : annotations )
descriptions.add("INFO=" + annotation.getDescription());
return descriptions;
}
// option #1: don't specify annotations to be used: standard annotations are used by default
public static Map<String, String> getAnnotations(ReferenceContext ref, AlignmentContext context, Variation variation) {
if ( standardAnnotations == null )

View File

@ -61,16 +61,6 @@ public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
logger.debug("Uniquified sample mapping: " + entry.getKey().first + "/" + entry.getKey().second + " -> " + entry.getValue());
}
// set up the header fields
Map<String, String> hInfo = new HashMap<String, String>();
hInfo.put("format", VCFWriter.VERSION);
hInfo.put("source", "CallsetConcordance");
hInfo.put("reference", getToolkit().getArguments().referenceFile.getName());
hInfo.put("explanation", "This file represents a concordance test of various call sets - NOT the output from a multi-sample caller");
VCFHeader header = new VCFHeader(hInfo, samples);
vcfWriter = new VCFWriter(header, OUTPUT);
// initialize requested concordance types
requestedTypes = new ArrayList<ConcordanceType>();
if (TYPES != null) {
@ -110,6 +100,25 @@ public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
throw new StingException("The requested concordance type (" + requestedType + ") isn't a valid concordance option");
}
}
// set up the header fields
Set<String> hInfo = new HashSet<String>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add("source=CallsetConcordance");
hInfo.add("note=\"This file represents a concordance test of various call sets - NOT the output from a multi-sample caller\"");
hInfo.addAll(getVCFAnnotationDescriptions(requestedTypes));
VCFHeader header = new VCFHeader(hInfo, samples);
vcfWriter = new VCFWriter(header, OUTPUT);
}
public static Set<String> getVCFAnnotationDescriptions(Collection<ConcordanceType> types) {
TreeSet<String> descriptions = new TreeSet<String>();
for ( ConcordanceType type : types )
descriptions.add("INFO=" + type.getInfoDescription());
return descriptions;
}
public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) {

View File

@ -11,4 +11,5 @@ public interface ConcordanceType {
public void initialize(Map<String,String> args, Set<String> samples);
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref);
public String getInfoName();
public String getInfoDescription();
}

View File

@ -101,5 +101,6 @@ public class IndelSubsets implements ConcordanceType {
return Math.max(leftRun, rightRun);
}
public String getInfoName() { return "IndelSubsets"; }
public String getInfoName() { return "IndelSubsets"; }
public String getInfoDescription() { return getInfoName() + ",1,String,\"Indel-related subsets\""; }
}

View File

@ -39,4 +39,5 @@ public class NWayVenn implements ConcordanceType {
}
public String getInfoName() { return "NwayVenn"; }
public String getInfoDescription() { return getInfoName() + ",1,String,\"N-way Venn split\""; }
}

View File

@ -95,4 +95,5 @@ public class SNPGenotypeConcordance implements ConcordanceType {
}
public String getInfoName() { return "SnpConcordance"; }
public String getInfoDescription() { return getInfoName() + ",1,String,\"SNP concordance test\""; }
}

View File

@ -59,4 +59,5 @@ public class SimpleVenn implements ConcordanceType {
}
public String getInfoName() { return "Venn"; }
public String getInfoDescription() { return getInfoName() + ",1,String,\"2-way Venn split\""; }
}

View File

@ -43,10 +43,10 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
private void initializeVcfWriter(RodVCF rod) {
// setup the header fields
Map<String, String> hInfo = new HashMap<String, String>();
hInfo.put("format", VCFWriter.VERSION);
hInfo.put("source", "VariantFiltration");
hInfo.put("reference", this.getToolkit().getArguments().referenceFile.getName());
Set<String> hInfo = new HashSet<String>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add("source=" + "VariantFiltration");
hInfo.add("reference=" + getToolkit().getArguments().referenceFile.getName());
VCFHeader header = new VCFHeader(hInfo, rod.getHeader().getGenotypeSamples());
writer = new VCFWriter(header, out);
@ -151,6 +151,10 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
filterString.append(";" + rec.getFilterString());
rec.setFilterString(filterString.toString());
}
// otherwise, if it's not already filtered, set it to "passing filters"
else if ( !rec.isFiltered() ) {
rec.setFilterString(VCFRecord.PASSES_FILTERS);
}
if ( writer == null )
initializeVcfWriter(variant);

View File

@ -133,17 +133,31 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED )
samples.clear();
// get the optional header fields
Set<String> headerInfo = new HashSet<String>();
if ( UAC.ALL_ANNOTATIONS )
headerInfo.addAll(VariantAnnotator.getAllVCFAnnotationDescriptions());
else
headerInfo.addAll(VariantAnnotator.getVCFAnnotationDescriptions());
headerInfo.add("INFO=AF,1,Float,\"Allele Frequency\"");
headerInfo.add("INFO=NS,1,Integer,\"Number of Samples With Data\"");
if ( !UAC.NO_SLOD )
headerInfo.add("INFO=SB,1,Float,\"Strand Bias\"");
// create the output writer stream
if ( VARIANTS_FILE != null )
writer = GenotypeWriterFactory.create(UAC.VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE,
"UnifiedGenotyper",
this.getToolkit().getArguments().referenceFile.getName(),
samples);
samples,
headerInfo);
else
writer = GenotypeWriterFactory.create(UAC.VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out,
"UnifiedGenotyper",
this.getToolkit().getArguments().referenceFile.getName(),
samples);
samples,
headerInfo);
callsMetrics = new CallMetrics();
}

View File

@ -50,18 +50,17 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
}
public static VCFHeader getHeader(GATKArgumentCollection args, Set<String> sampleNames) {
Map<String, String> metaData = new HashMap<String, String>();
Set<String> additionalColumns = new HashSet<String>();
// Don't output the data for now because it kills our unit test MD5s and is optional
// TODO - figure out what to do here
//Calendar cal = Calendar.getInstance();
//metaData.put("fileDate", String.format("%d%02d%02d", cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH)));
metaData.put("format", "VCRv3.2");
metaData.put("source", "VariantsToVCF");
metaData.put("reference", args.referenceFile.getAbsolutePath());
Set<String> metaData = new HashSet<String>();
metaData.add("source=VariantsToVCF");
metaData.add("reference=" + args.referenceFile.getAbsolutePath());
Set<String> additionalColumns = new HashSet<String>();
additionalColumns.add("FORMAT");
additionalColumns.addAll(sampleNames);

View File

@ -29,13 +29,12 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
private VCFWriter vwriter = null;
public void initializeWriter() {
Map<String, String> metaData = new HashMap<String, String>();
Set<String> metaData = new HashSet<String>();
metaData.add("source=VariantsToVCF");
metaData.add("reference=" + this.getToolkit().getArguments().referenceFile.getAbsolutePath());
Set<String> additionalColumns = new HashSet<String>();
metaData.put("format", "VCRv3.2");
metaData.put("source", "VariantsToVCF");
metaData.put("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath());
additionalColumns.add("FORMAT");
additionalColumns.addAll(SAMPLES);

View File

@ -30,9 +30,9 @@ import java.util.List;
/**
* @author aaron
* <p/>
* Class Genotype
* Class GenotypeWriter
* <p/>
* The interface for storing genotype calls.
* The interface for writing genotype calls.
*/
public interface GenotypeWriter {
@ -55,8 +55,9 @@ public interface GenotypeWriter {
/**
* add a multi-sample call if we support it
* @param genotypes the list of genotypes, that are backed by sample information
* @param variation the variation
*/
public void addMultiSampleCall(List<Genotype> genotypes, VariationCall metadata);
public void addMultiSampleCall(List<Genotype> genotypes, VariationCall variation);
/**
* @return true if we support multisample, false otherwise

View File

@ -33,6 +33,7 @@ public class GenotypeWriterFactory {
* @param source the source
* @param referenceName the ref name
* @param sampleNames the sample names
* @param headerInfo the optional header info fields
* @return the genotype writer object
*/
public static GenotypeWriter create(GENOTYPE_FORMAT format,
@ -40,7 +41,8 @@ public class GenotypeWriterFactory {
File destination,
String source,
String referenceName,
Set<String> sampleNames ) {
Set<String> sampleNames,
Set<String> headerInfo) {
switch (format) {
case GLF:
return new GLFWriter(header.toString(), destination);
@ -49,7 +51,7 @@ public class GenotypeWriterFactory {
case GELI_BINARY:
return new GeliAdapter(destination, header);
case VCF:
return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames);
return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames, headerInfo);
default:
throw new StingException("Genotype writer " + format.toString() + " is not implemented");
}
@ -60,14 +62,15 @@ public class GenotypeWriterFactory {
PrintStream destination,
String source,
String referenceName,
Set<String> sampleNames ) {
Set<String> sampleNames,
Set<String> headerInfo) {
switch (format) {
case GELI:
return new GeliTextWriter(destination);
case GLF:
return new GLFWriter(header.toString(), destination);
case VCF:
return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames);
return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames, headerInfo);
default:
throw new StingException("Genotype writer to " + format.toString() + " to standard output is not implemented");
}

View File

@ -16,12 +16,25 @@ import java.util.Map;
* <p/>
*/
public class VCFGenotypeRecord implements Genotype {
// the symbols for an empty genotype
// key names
public static final String GENOTYPE_KEY = "GT";
public static final String GENOTYPE_QUALITY_KEY = "GQ";
public static final String DEPTH_KEY = "DP";
public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
public static final String FILTER_KEY = "FT";
public static final String OLD_DEPTH_KEY = "RD";
// the values for empty fields
public static final String EMPTY_GENOTYPE = "./.";
public static final String EMPTY_ALLELE = ".";
public static final int MISSING_GENOTYPE_QUALITY = -1;
public static final int MISSING_DEPTH = -1;
public static final int MISSING_HAPLOTYPE_QUALITY = -1;
public static final String UNFILTERED = ".";
public static final double MAX_QUAL_VALUE = 99.0;
// what kind of phasing this genotype has
public enum PHASE {
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
@ -51,14 +64,21 @@ public class VCFGenotypeRecord implements Genotype {
* @param otherFlags other flags
*/
public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing, Map<String, String> otherFlags) {
this.mSampleName = sampleName;
mSampleName = sampleName;
if (genotypes != null) this.mGenotypeAlleles.addAll(genotypes);
this.mPhaseType = phasing;
if (otherFlags != null) this.mFields.putAll(otherFlags);
mPhaseType = phasing;
if (otherFlags != null) {
// we need to be backwards compatible
if ( otherFlags.containsKey(OLD_DEPTH_KEY) ) {
otherFlags.put(DEPTH_KEY, otherFlags.get(OLD_DEPTH_KEY));
otherFlags.remove(OLD_DEPTH_KEY);
}
mFields.putAll(otherFlags);
}
}
public void setVCFRecord(VCFRecord record) {
this.mRecord = record;
mRecord = record;
}
public void setSampleName(String name) {
@ -102,16 +122,11 @@ public class VCFGenotypeRecord implements Genotype {
}
public double getNegLog10PError() {
return ( mFields.containsKey("GQ") ? Double.valueOf(mFields.get("GQ")) / 10.0 : 0.0);
return ( mFields.containsKey(GENOTYPE_QUALITY_KEY) ? Double.valueOf(mFields.get(GENOTYPE_QUALITY_KEY)) / 10.0 : MISSING_GENOTYPE_QUALITY);
}
public int getReadCount() {
int depth = MISSING_DEPTH;
if ( mFields.containsKey("RD") )
depth = Integer.valueOf(mFields.get("RD"));
else if ( mFields.containsKey("DP") )
depth = Integer.valueOf(mFields.get("DP"));
return depth;
return ( mFields.containsKey(DEPTH_KEY) ? Integer.valueOf(mFields.get(DEPTH_KEY)) : MISSING_DEPTH);
}
public GenomeLoc getLocation() {
@ -227,11 +242,26 @@ public class VCFGenotypeRecord implements Genotype {
StringBuilder builder = new StringBuilder();
builder.append(toGenotypeString(altAlleles));
for (String field : mFields.keySet()) {
if (mFields.get(field).equals("")) continue;
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
builder.append(mFields.get(field));
if (mFields.get(field).equals(""))
builder.append(getMissingFieldValue(field));
else
builder.append(mFields.get(field));
}
return builder.toString();
}
public static String getMissingFieldValue(String field) {
String result = "";
if ( field.equals(GENOTYPE_QUALITY_KEY) )
result = String.valueOf(MISSING_GENOTYPE_QUALITY);
else if ( field.equals(DEPTH_KEY) )
result = String.valueOf(MISSING_DEPTH);
else if ( field.equals(FILTER_KEY) )
result = UNFILTERED;
// TODO -- support haplotype quality
//else if ( field.equals(HAPLOTYPE_QUALITY_KEY) )
// result = String.valueOf(MISSING_HAPLOTYPE_QUALITY);
return result;
}
}

View File

@ -27,23 +27,23 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class);
public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo, Set<String> sampleNames) {
public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo, Set<String> sampleNames, Set<String> headerInfo) {
mReferenceName = referenceName;
mSource = source;
mSampleNames.addAll(sampleNames);
initializeHeader();
initializeHeader(headerInfo);
if (writeTo == null) throw new RuntimeException("VCF output file must not be null");
mWriter = new VCFWriter(mHeader, writeTo);
}
public VCFGenotypeWriterAdapter(String source, String referenceName, OutputStream writeTo, Set<String> sampleNames) {
public VCFGenotypeWriterAdapter(String source, String referenceName, OutputStream writeTo, Set<String> sampleNames, Set<String> headerInfo) {
mReferenceName = referenceName;
mSource = source;
mSampleNames.addAll(sampleNames);
initializeHeader();
initializeHeader(headerInfo);
if (writeTo == null) throw new RuntimeException("VCF output stream must not be null");
mWriter = new VCFWriter(mHeader, writeTo);
@ -51,15 +51,18 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
/**
* initialize this VCF header
*
* @param optionalHeaderInfo the optional header fields
*/
private void initializeHeader() {
Map<String, String> hInfo = new HashMap<String, String>();
private void initializeHeader(Set<String> optionalHeaderInfo) {
Set<String> hInfo = new TreeSet<String>();
// setup the header fields
hInfo.put("format", VCFWriter.VERSION);
hInfo.put("source", mSource);
hInfo.put("reference", mReferenceName);
hInfo.add(VCFHeader.FULL_FORMAT_LINE);
hInfo.add("source=" + mSource);
hInfo.add("reference=" + mReferenceName);
hInfo.addAll(optionalHeaderInfo);
// setup the sample names
mHeader = new VCFHeader(hInfo, mSampleNames);
}
@ -114,7 +117,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
throw new IllegalArgumentException("Only VCFVariationCall objects should be passed in to the VCF writers");
VCFParameters params = new VCFParameters();
params.addFormatItem("GT");
params.addFormatItem(VCFGenotypeRecord.GENOTYPE_KEY);
// get the location and reference
if ( genotypes.size() == 0 ) {
@ -166,10 +169,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(),
params.getContig(),
params.getPosition(),
(dbSnpID == null ? "." : dbSnpID),
(dbSnpID == null ? VCFRecord.EMPTY_ID_FIELD : dbSnpID),
params.getAlternateBases(),
qual,
"0",
VCFRecord.UNFILTERED,
infoFields,
params.getFormatString(),
params.getGenotypesRecords());
@ -189,15 +192,15 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
Map<String, String> infoFields = new HashMap<String, String>();
if ( locusdata != null ) {
if ( locusdata.getSLOD() != null )
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
infoFields.put(VCFRecord.STRAND_BIAS_KEY, String.format("%.2f", locusdata.getSLOD()));
if ( locusdata.hasNonRefAlleleFrequency() )
infoFields.put("AF", String.format("%.2f", locusdata.getNonRefAlleleFrequency()));
infoFields.put(VCFRecord.ALLELE_FREQUENCY_KEY, String.format("%.2f", locusdata.getNonRefAlleleFrequency()));
Map<String, String> otherFields = locusdata.getFields();
if ( otherFields != null ) {
infoFields.putAll(otherFields);
}
}
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
infoFields.put(VCFRecord.SAMPLE_NUMBER_KEY, String.valueOf(params.getGenotypesRecords().size()));
return infoFields;
}

View File

@ -1,7 +1,5 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.apache.log4j.Logger;
import java.util.*;
@ -14,13 +12,25 @@ import java.util.*;
*/
public class VCFHeader {
public static final String FILE_FORMAT_KEY = "fileformat=";
public static final String OLD_FILE_FORMAT_KEY = "format="; // from version 3.2
/** the current vcf version we support. */
public static final String VCF_VERSION_HEADER = "VCFv";
public static final String OLD_VCF_VERSION_HEADER = "VCRv"; // from version 3.2
public static final double VCF_VERSION_NUMBER = 3.3;
public static final String VCF_VERSION = VCF_VERSION_HEADER + VCF_VERSION_NUMBER;
public static final String FULL_FORMAT_LINE = FILE_FORMAT_KEY + VCF_VERSION;
// the mandatory header fields
public enum HEADER_FIELDS {
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
}
// the associated meta data
private final Map<String, String> mMetaData = new HashMap<String, String>();
private final Set<String> mMetaData;
// the list of auxiliary tags
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
@ -31,22 +41,17 @@ public class VCFHeader {
// the header string indicator
public static final String HEADER_INDICATOR = "#";
/** our log, which we use to capture anything from this class */
private static Logger logger = Logger.getLogger(VCFHeader.class);
/** do we have genotyping data? */
private boolean hasGenotypingData = false;
/** the current vcf version we support. */
private static final String VCF_VERSION = "VCRv3.2";
/**
* create a VCF header, given a list of meta data and auxiliary tags
*
* @param metaData the meta data associated with this header
*/
public VCFHeader(Map<String, String> metaData) {
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
public VCFHeader(Set<String> metaData) {
mMetaData = new TreeSet<String>(metaData);
checkVCFVersion();
}
@ -56,8 +61,8 @@ public class VCFHeader {
* @param metaData the meta data associated with this header
* @param genotypeSampleNames the genotype format field, and the sample names
*/
public VCFHeader(Map<String, String> metaData, Set<String> genotypeSampleNames) {
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
public VCFHeader(Set<String> metaData, Set<String> genotypeSampleNames) {
mMetaData = new TreeSet<String>(metaData);
for (String col : genotypeSampleNames) {
if (!col.equals("FORMAT"))
mGenotypeSampleNames.add(col);
@ -71,13 +76,34 @@ public class VCFHeader {
* or the version is not present
*/
public void checkVCFVersion() {
if (mMetaData.containsKey("format")) {
if (mMetaData.get("format").equals(VCF_VERSION))
return;
throw new RuntimeException("VCFHeader: VCF version of " + mMetaData.get("format") +
" doesn't match the supported version of " + VCF_VERSION);
String version = null;
for ( String field : mMetaData ) {
if ( field.startsWith(FILE_FORMAT_KEY) ) {
version = field.substring(FILE_FORMAT_KEY.length());
break;
}
else if ( field.startsWith(OLD_FILE_FORMAT_KEY) ) {
version = field.substring(OLD_FILE_FORMAT_KEY.length());
break;
}
}
throw new RuntimeException("VCFHeader: VCF version isn't present");
if ( version == null )
mMetaData.add(FULL_FORMAT_LINE);
else if ( !isSupportedVersion(version) )
throw new RuntimeException("VCF version " + version +
" is not yet supported; only version " + VCF_VERSION + " and earlier can be used");
}
private boolean isSupportedVersion(String version) {
if ( !version.startsWith(VCF_VERSION_HEADER) && !version.startsWith(OLD_VCF_VERSION_HEADER) )
return false;
try {
double dVersion = Double.valueOf(version.substring(VCF_VERSION_HEADER.length()));
return dVersion <= VCF_VERSION_NUMBER;
} catch (Exception e) { }
return false;
}
/**
@ -96,9 +122,9 @@ public class VCFHeader {
/**
* get the meta data, associated with this header
*
* @return a map of the meta data
* @return a set of the meta data
*/
public Map<String, String> getMetaData() {
public Set<String> getMetaData() {
return mMetaData;
}

View File

@ -21,9 +21,6 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
// our next record
private VCFRecord mNextRecord = null;
// a pattern we use for detecting meta data and header lines
private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$");
// our pattern matching for the genotype mFields
private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)");
@ -142,24 +139,11 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
* @return a VCF Header created from the list of strings
*/
protected VCFHeader createHeader(List<String> headerStrings) {
Map<String, String> metaData = new HashMap<String, String>();
Set<String> metaData = new TreeSet<String>();
Set<String> auxTags = new LinkedHashSet<String>();
// iterate over all the passed in strings
for (String str : headerStrings) {
Matcher matcher = pMeta.matcher(str);
if (matcher.matches()) {
String metaKey;
String metaValue = "";
if (matcher.groupCount() < 1) continue;
if (matcher.groupCount() == 2) metaValue = matcher.group(2);
metaKey = matcher.group(1);
metaData.put(metaKey, metaValue);
}
}
// iterate over all the passed in strings
for (String str : headerStrings) { // TODO: fix, we shouldn't loop over every line
if (str.startsWith("#") && !str.startsWith("##")) {
for ( String str : headerStrings ) {
if ( !str.startsWith("##") ) {
String[] strings = str.substring(1).split("\\s+");
// the columns should be in order according to Richard Durbin
int arrayIndex = 0;
@ -177,8 +161,11 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
auxTags.add(strings[arrayIndex]);
arrayIndex++;
}
} else {
metaData.add(str.substring(2));
}
}
return new VCFHeader(metaData, auxTags);
}
@ -266,7 +253,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":");
parse = genotypeString.substring(0, nextDivider);
}
if (key.equals("GT")) {
if (key.equals(VCFGenotypeRecord.GENOTYPE_KEY)) {
Matcher m = gtPattern.matcher(parse);
if (!m.matches())
throw new RuntimeException("VCFReader: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse);
@ -274,6 +261,8 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
addAllele(m.group(1), altAlleles, referenceBase, bases);
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
} else {
if ( parse.length() == 0 )
parse = VCFGenotypeRecord.getMissingFieldValue(key);
tagToValue.put(key, parse);
}
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;

View File

@ -11,13 +11,31 @@ import java.util.*;
*/
public class VCFRecord implements Variation, VariantBackedByGenotype {
// standard info field keys
public static final String ANCESTRAL_ALLELE_KEY = "AA";
public static final String ALLELE_COUNT_KEY = "AC";
public static final String ALLELE_FREQUENCY_KEY = "AF";
public static final String ALLELE_NUMBER_KEY = "AN";
public static final String RMS_BASE_QUALITY_KEY = "BQ";
public static final String DBSNP_KEY = "DB";
public static final String DEPTH_KEY = "DP";
public static final String HAPMAP2_KEY = "H2";
public static final String RMS_MAPPING_QUALITY_KEY = "MQ";
public static final String SAMPLE_NUMBER_KEY = "NS";
public static final String STRAND_BIAS_KEY = "SB";
// commonly used strings that are in the standard
public static final String FORMAT_FIELD_SEPERATOR = ":";
public static final String GENOTYPE_FIELD_SEPERATOR = ":";
public static final String FIELD_SEPERATOR = "\t";
public static final String FILTER_CODE_SEPERATOR = ";";
public static final String INFO_FIELD_SEPERATOR = ";";
// default values
public static final String UNFILTERED = ".";
public static final String PASSES_FILTERS = "0";
public static final String EMPTY_INFO_FIELD = ".";
public static final String EMPTY_ID_FIELD = ".";
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
// the reference base
@ -139,13 +157,12 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
String vals[] = columnValues.get(val).split(";");
for (String alt : vals) {
String keyVal[] = alt.split("=");
if (keyVal.length == 1 && keyVal[0].equals(".")) {
if ( keyVal.length == 1 )
this.addInfoField(keyVal[0], "");
break;
}
if (keyVal.length != 2)
else if (keyVal.length == 2)
this.addInfoField(keyVal[0], keyVal[1]);
else
throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt);
this.addInfoField(keyVal[0], keyVal[1]);
}
break;
}
@ -231,14 +248,14 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
}
public double getNonRefAlleleFrequency() {
if ( mInfoFields.containsKey("AF") ) {
return Double.valueOf(mInfoFields.get("AF"));
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
} else {
// this is the poor man's AF
if ( mInfoFields.containsKey("AC") && mInfoFields.containsKey("AN")) {
String splt[] = mInfoFields.get("AC").split(",");
if ( mInfoFields.containsKey(ALLELE_COUNT_KEY) && mInfoFields.containsKey(ALLELE_NUMBER_KEY)) {
String splt[] = mInfoFields.get(ALLELE_COUNT_KEY).split(",");
if ( splt.length > 0 ) {
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get("AN")));
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get(ALLELE_NUMBER_KEY)));
}
}
}
@ -250,9 +267,13 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
if ( !hasAlternateAllele() )
return VARIANT_TYPE.REFERENCE;
// TODO -- figure out what to do about records with more than one type
VCFGenotypeEncoding encoding = mAlts.get(0);
switch ( encoding.getType() ) {
VCFGenotypeEncoding.TYPE type = mAlts.get(0).getType();
for (int i = 1; i < mAlts.size(); i++) {
if ( mAlts.get(i).getType() != type )
throw new IllegalStateException("The record contains multiple encoding types");
}
switch ( type ) {
case SINGLE_BASE:
return VARIANT_TYPE.SNP;
case DELETION:
@ -309,23 +330,20 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
/**
* get the filter criteria
*
* @return an array of strings representing the filtering criteria, or 0 is none are applied
* @return an array of strings representing the filtering criteria, or UNFILTERED if none are applied
*/
public String[] getFilteringCodes() {
if (mFilterString == null) return new String[]{"0"};
return this.mFilterString.split(FILTER_CODE_SEPERATOR);
if (mFilterString == null) return new String[]{UNFILTERED};
return mFilterString.split(FILTER_CODE_SEPERATOR);
}
public boolean isFiltered() {
String[] codes = getFilteringCodes();
if ( codes.length > 1 ) return true;
else if ( codes[0].equals(".") || codes[0].equals("0") ) return false;
else return true;
return !codes[0].equals(UNFILTERED) && !codes[0].equals(PASSES_FILTERS);
}
public boolean hasFilteringCodes() {
// todo --- currently always returns true
return getFilteringCodes() != null;
return mFilterString != null;
}
public String getFilterString() {
@ -567,7 +585,9 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
if (!this.mLoc.equals(other.mLoc)) return false;
if (!this.mID.equals(other.mID)) return false;
if (this.mQual != other.mQual) return false;
if (!this.mFilterString.equals(other.mFilterString)) return false;
if ( this.mFilterString == null ) {
if ( other.mFilterString != null ) return false;
} else if ( !this.mFilterString.equals(other.mFilterString) ) return false;
if (!this.mInfoFields.equals(other.mInfoFields)) return false;
if (!this.mGenotypeFields.equals(other.mGenotypeFields)) return false;
return true;

View File

@ -20,12 +20,38 @@ public class VCFUtils {
private VCFUtils() { }
/**
 * Gets the header meta-data fields from all VCF rods input by the user.
 *
 * @param toolkit GATK engine whose reference-ordered data sources are scanned
 *
 * @return a set of all header meta-data fields found across the VCF rods
 */
public static Set<String> getHeaderFields(GenomeAnalysisEngine toolkit) {
    // TreeSet so the merged fields come back in a deterministic, sorted order
    TreeSet<String> fields = new TreeSet<String>();

    // scan every rod data source, collecting header meta-data from the VCF-typed ones
    List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
    for ( ReferenceOrderedDataSource source : dataSources ) {
        ReferenceOrderedData rod = source.getReferenceOrderedData();
        if ( rod.getType().equals(RodVCF.class) ) {
            VCFReader reader = new VCFReader(rod.getFile());
            try {
                fields.addAll(reader.getHeader().getMetaData());
            } finally {
                // release the underlying file handle even if header parsing throws
                reader.close();
            }
        }
    }

    return fields;
}
/**
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
* (e.g. sampleX.1, sampleX.2, ...)
* When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping
* from rod/sample pairs to the new uniquified names
*
*
* @param toolkit GATK engine
* @param samples set to store the sample names
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
@ -104,7 +130,7 @@ public class VCFUtils {
public static VCFRecord mergeRecords(List<RodVCF> rods, Map<Pair<String, String>, String> rodNamesToSampleNames) {
VCFParameters params = new VCFParameters();
params.addFormatItem("GT");
params.addFormatItem(VCFGenotypeRecord.GENOTYPE_KEY);
// keep track of the locus specific data so we can merge them intelligently
int totalReadDepth = 0;
@ -122,7 +148,9 @@ public class VCFUtils {
if ( params.getPosition() < 1 )
params.setLocations(rod.getLocation(), call.getReference());
params.addGenotypeRecord(createVCFGenotypeRecord(params, call, rod.mCurrentRecord));
totalReadDepth += call.getReadCount();
int depth = call.getReadCount();
if ( depth > 0 )
totalReadDepth += call.getReadCount();
}
// set the overall confidence to be the max entry we see
@ -142,14 +170,14 @@ public class VCFUtils {
}
Map<String, String> infoFields = new HashMap<String, String>();
infoFields.put("DP", String.format("%d", totalReadDepth));
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
infoFields.put(VCFRecord.DEPTH_KEY, String.format("%d", totalReadDepth));
infoFields.put(VCFRecord.SAMPLE_NUMBER_KEY, String.valueOf(params.getGenotypesRecords().size()));
// set the overall strand bias and allele frequency to be the average of all entries we've seen
if ( SLODsSeen > 0 )
infoFields.put("SB", String.format("%.2f", (totalSLOD/(double)SLODsSeen)));
infoFields.put(VCFRecord.STRAND_BIAS_KEY, String.format("%.2f", (totalSLOD/(double)SLODsSeen)));
if ( freqsSeen > 0 )
infoFields.put("AF", String.format("%.2f", (totalFreq/(double)freqsSeen)));
infoFields.put(VCFRecord.ALLELE_FREQUENCY_KEY, String.format("%.2f", (totalFreq/(double)freqsSeen)));
return new VCFRecord(params.getReferenceBase(),
params.getContig(),
@ -175,13 +203,12 @@ public class VCFUtils {
public static VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeRecord gtype, VCFRecord vcfrecord) {
Map<String, String> map = new HashMap<String, String>();
// calculate the RMS mapping qualities and the read depth
int readDepth = gtype.getReadCount();
map.put("RD", String.valueOf(readDepth));
params.addFormatItem("RD");
double qual = 10.0 * gtype.getNegLog10PError();
map.put("GQ", String.format("%.2f", qual));
params.addFormatItem("GQ");
// calculate the genotype quality and the read depth
map.put(VCFGenotypeRecord.DEPTH_KEY, String.valueOf(gtype.getReadCount()));
params.addFormatItem(VCFGenotypeRecord.DEPTH_KEY);
double qual = Math.min(10.0 * gtype.getNegLog10PError(), VCFGenotypeRecord.MAX_QUAL_VALUE);
map.put(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY, String.format("%.2f", qual));
params.addFormatItem(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
for (VCFGenotypeEncoding allele : alleles) {
@ -208,12 +235,11 @@ public class VCFUtils {
Map<String, String> map = new HashMap<String, String>();
// calculate the RMS mapping qualities and the read depth
int readDepth = gtype.getReadCount();
map.put("RD", String.valueOf(readDepth));
params.addFormatItem("RD");
double qual = gtype.getNegLog10PError();
map.put("GQ", String.format("%.2f", qual));
params.addFormatItem("GQ");
map.put(VCFGenotypeRecord.DEPTH_KEY, String.valueOf(gtype.getReadCount()));
params.addFormatItem(VCFGenotypeRecord.DEPTH_KEY);
double qual = Math.min(10.0 * gtype.getNegLog10PError(), VCFGenotypeRecord.MAX_QUAL_VALUE);
map.put(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY, String.format("%.2f", qual));
params.addFormatItem(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
for (VCFGenotypeEncoding allele : alleles) {

View File

@ -2,14 +2,14 @@ package org.broadinstitute.sting.utils.genotype.vcf;
import java.io.*;
import java.util.TreeSet;
/**
* this class writers VCF files
*/
public class VCFWriter {
public static final String VERSION = "VCRv3.2";
// the VCF header we're storing
private VCFHeader mHeader;
@ -49,10 +49,25 @@ public class VCFWriter {
mWriter = new BufferedWriter(
new OutputStreamWriter(location));
try {
// write the header meta-data out
for (String metadata : header.getMetaData().keySet()) {
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "=" + header.getMetaData().get(metadata) + "\n");
// the fileformat field needs to be written first
TreeSet<String> allMetaData = new TreeSet<String>(header.getMetaData());
for ( String metadata : allMetaData ) {
if ( metadata.startsWith(VCFHeader.FILE_FORMAT_KEY) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "\n");
break;
}
else if ( metadata.startsWith(VCFHeader.OLD_FILE_FORMAT_KEY) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeader.FILE_FORMAT_KEY + metadata.substring(VCFHeader.OLD_FILE_FORMAT_KEY.length()) + "\n");
break;
}
}
// write the rest of the header meta-data out
for ( String metadata : header.getMetaData() ) {
if ( !metadata.startsWith(VCFHeader.FILE_FORMAT_KEY) && !metadata.startsWith(VCFHeader.OLD_FILE_FORMAT_KEY) )
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "\n");
}
// write out the column line
StringBuilder b = new StringBuilder();
b.append(VCFHeader.HEADER_INDICATOR);

View File

@ -91,7 +91,7 @@ public class RodVCFTest extends BaseTest {
@Test
public void testToString() {
// slightly altered line, due to map ordering
final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tAF=0.786;DP=258;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n";
final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tAF=0.786;DP=258;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:\n";
RodVCF vcf = getVCFObject();
VCFReader reader = new VCFReader(vcfFile);
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
@ -105,7 +105,7 @@ public class RodVCFTest extends BaseTest {
// verify the first line too
if (first) {
if (!firstLine.equals(rec1.toStringEncoding(mHeader) + "\n")) {
fail("VCF record rec1.toStringEncoding() != expected string :\n" + rec1.toStringEncoding(mHeader) + firstLine);
fail("VCF record rec1.toStringEncoding() != expected string :\n" + rec1.toStringEncoding(mHeader) + "\n" + firstLine);
}
first = false;
}

View File

@ -33,7 +33,7 @@ public class SecondBaseSkewIntegrationTest extends WalkerTest {
+"-B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
+"-vcf %s -sample variant -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list";
String md5_for_this_test = "a2ec1d36e77da56b4a11eef34d73296e";
String md5_for_this_test = "f7e67c353d3113447d1b9c8c39de6ed0";
WalkerTestSpec spec = new WalkerTestSpec(test_args,1, Arrays.asList(md5_for_this_test));
executeTest("Testing on E2 annotated but not Q2 annotated file ",spec);
@ -49,7 +49,7 @@ public class SecondBaseSkewIntegrationTest extends WalkerTest {
+"-B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
+"-vcf %s -sample variant -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list";
String md5_for_this_test = "f105fd8a7ae7026a55107b86e768553a";
String md5_for_this_test = "3eee411119888fc4633870a91ed2093d";
WalkerTestSpec spec = new WalkerTestSpec(test_args,1, Arrays.asList(md5_for_this_test));
executeTest("Testing on bam file without 2bb annotations ",spec);
@ -61,7 +61,7 @@ public class SecondBaseSkewIntegrationTest extends WalkerTest {
+ " -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -A SecondBaseSkew"
+ " -sample variant -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pileup_test_chr15.vcf"
+ " -vcf %s -L chr15:46347148";
String expected_md5 = "160a8e3826eb745bcfe2f463f73e1ec7";
String expected_md5 = "c70dfb30c3caa9184604f88bc7f62a07";
WalkerTestSpec spec = new WalkerTestSpec(test_args,1,Arrays.asList(expected_md5));
executeTest("Testing on locus with many indels", spec);
}

View File

@ -34,8 +34,8 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public static String secondBaseTestmd5( int testNo ) {
switch ( testNo ) {
case 1: return "8f5b4b29eefb25d6b7b9e32a1c90f144";
case 2: return "3a53d945b38e1fc87a801f23115222fe";
case 1: return "bf64bac186fd682018dd7f0419d90190";
case 2: return "67f40627b12be31efe02c9d853fbcf37";
default: throw new StingException("Impossible test has been run: secondbasetest number "+testNo);
}
}
@ -50,7 +50,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsNotAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("0ff01251afdabafb4e137357b25be72a"));
Arrays.asList("6903f8b31820ce7a56230b99f9a9309c"));
executeTest("test file has annotations, not asking for annotations, #1", spec);
}
@ -58,7 +58,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsNotAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("227f38b2f322780187eacd0c98ace3e6"));
Arrays.asList("5298c24e956361d209f14ac6138a3bbd"));
executeTest("test file has annotations, not asking for annotations, #2", spec);
}
@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("9a30487ad885f4d49569032fe6463af3"));
Arrays.asList("31ddecd3118bd6aea00bcd0369a1b32f"));
executeTest("test file has annotations, asking for annotations, #1", spec);
}
@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("ef01d735ea0bcfeb6e7394c65f2a1938"));
Arrays.asList("553c55f3bcf31a6b1d9a2c1d27fde480"));
executeTest("test file has annotations, asking for annotations, #2", spec);
}
@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsNotAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("18d85a9711d56bdf7e2327b83d6745e2"));
Arrays.asList("b46f395864cb71b887d69342f56e7fdb"));
executeTest("test file doesn't have annotations, not asking for annotations, #1", spec);
}
@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsNotAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("da51f18d189fb0fd804822c81b786e0f"));
Arrays.asList("aaae89c48fcab615fe4204220ec62859"));
executeTest("test file doesn't have annotations, not asking for annotations, #2", spec);
}
@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("7e6ceb79e9a1f104723299ed68b236c6"));
Arrays.asList("fa1071b292fd2e9db5c36410ffd44fdb"));
executeTest("test file doesn't have annotations, asking for annotations, #1", spec);
}
@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("87748d4c80ff76701dd01d7b0f803249"));
Arrays.asList("9d133a91949e477523a039a0a52ac2a8"));
executeTest("test file doesn't have annotations, asking for annotations, #2", spec);
}

View File

@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testSimpleVenn() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1,
Arrays.asList("851b68004874f3a2e76d795e7401f8a0"));
Arrays.asList("2c7e18901dbf27bac9f36b3dbee063c6"));
executeTest("testSimpleVenn", spec);
}
@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testSNPConcordance() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
Arrays.asList("7afb56b30257fe2d66bee7a029d75685"));
Arrays.asList("c21d59fc3194c39c662d2e74b53dcf9c"));
executeTest("testSNPConcordance", spec);
}
@ -30,7 +30,15 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testNWayVenn() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1,
Arrays.asList("f452c04c600ad10c054f18b0c77b53d5"));
Arrays.asList("2b38ae235edd10773dbee0bfae036e35"));
executeTest("testNWayVenn", spec);
}
/**
 * Runs three concordance tests (SimpleVenn, NWayVenn, SNPGenotypeConcordance)
 * together in a single command, pinned by an expected output md5.
 */
@Test
public void testMulti() {
    final String cmd = baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5";
    executeTest("testMulti", new WalkerTestSpec(cmd, 1, Arrays.asList("9bcc83aadac00a160cef20e7126368ee")));
}
}

View File

@ -16,7 +16,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testNoAction() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("2408d449fbe7bf74099cc53d2d97c248"));
Arrays.asList("49817e684effce7b6f3d5776dc781988"));
executeTest("test no action", spec);
}
@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testClusteredSnps() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -window 10 -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("58a2d4cd3d3ba1460833b45b9b8455c2"));
Arrays.asList("ee5e4e00bf25a912e8cab3e768fa0e7d"));
executeTest("test clustered SNPs", spec);
}
@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testMask() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -mask foo -B mask,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("9fc2cb210b3595159f34ddfba5a2e572"));
Arrays.asList("77451c3ba8a070343e69157cdaf2be92"));
executeTest("test mask", spec);
}
@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testFilter1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("5effa4b4fdd4dd33a373561637a5d86e"));
Arrays.asList("3b44be79d676536bd6c0f32774091fee"));
executeTest("test filter #1", spec);
}
@ -48,7 +48,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testFilter2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("5e0077148eda3b4274fbef1048902d47"));
Arrays.asList("baa425fa8e0761a05733a1a9d62d02ff"));
executeTest("test filter #2", spec);
}
}

View File

@ -47,7 +47,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1PointEM() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
Arrays.asList("ad7024c3c880a451d2f5537797b49beb"));
Arrays.asList("68fcdca40df72b3c703bab846c5f0bbd"));
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
}
@ -55,7 +55,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2PointEM() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
Arrays.asList("7709af47dde8e127e0e36e86073e2cb1"));
Arrays.asList("eb0cd5494ae4c1781bfa00b6c4146993"));
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
}
@ -68,7 +68,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testPooled1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
Arrays.asList("1905bc65b1abb56c776558d562de5ea1"));
Arrays.asList("ec1aeb69d7d54a7ced1ce625146d1d59"));
executeTest("testPooled1", spec);
}
@ -81,7 +81,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("aaecb9fe822a42576500a91973baff03"));
Arrays.asList("e27552dad05ddf17403aaa7176b9cfe2"));
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
}
@ -89,7 +89,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("d36a8ba5ddf1265ab5be2ed390fa56e1"));
Arrays.asList("3618711e41b7e37f47b995d39adbc76b"));
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
}
@ -97,7 +97,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("029706a60440660c6a636091e9489122"));
Arrays.asList("2a32add40319ab2de44951624df2be4b"));
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
}

View File

@ -21,7 +21,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testVariantsToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("0b96a8046d2a06bd87f57df8bac1678d");
md5.add("a94c15f2e8905fd3e98301375cf0f42a");
/**
* the above MD5 was calculated from running the following command:
@ -50,7 +50,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("09660faa7cfad8af36602f79461c0605");
md5.add("6b18f33e25edbd2154c17a949656644b");
/**
* the above MD5 was calculated from running the following command:

View File

@ -18,7 +18,7 @@ import java.util.*;
public class VCFHeaderTest extends BaseTest {
private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
private Map<String, String> metaData = new HashMap();
private Set<String> metaData = new HashSet();
private Set<String> additionalColumns = new HashSet<String>();
/**
@ -26,8 +26,8 @@ public class VCFHeaderTest extends BaseTest {
*/
@Test
public void testHeaderConstructor() {
metaData.put("format","VCRv3.2");
metaData.put("two","2");
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
metaData.add("two=2");
additionalColumns.add("extra1");
additionalColumns.add("extra2");
// this should create a header that is valid
@ -40,12 +40,7 @@ public class VCFHeaderTest extends BaseTest {
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index],field);
index++;
}
index = 0;
for (String key: header.getMetaData().keySet()) {
Assert.assertEquals(header.getMetaData().get(key),metaData.get(key));
index++;
}
Assert.assertEquals(metaData.size(),index);
Assert.assertEquals(metaData.size(), header.getMetaData().size());
index = 0;
for (String key: header.getGenotypeSamples()) {
Assert.assertTrue(additionalColumns.contains(key));

View File

@ -0,0 +1,18 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
/**
 * Integration test that reads a complex example VCF through PrintRODs and
 * emits each record, with the expected output pinned by an md5.
 */
public class VCFIntegrationTest extends WalkerTest {
    @Test
    public void test1() {
        // Read in and then emit each record
        final String args = "-T PrintRODs -R /broad/1KG/reference/human_b36_both.fasta -L 1:10,000,000-10,050,000 -o %s -B vcf,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/complexExample.vcf";
        executeTest("test vcf", new WalkerTestSpec(args, 1, Arrays.asList("26ad7a663d0f247ac26ce5490edd7ec0")));
    }
}

View File

@ -20,7 +20,7 @@ public class VCFReaderTest extends BaseTest {
private static final File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
private static final File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf");
private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.vcf";
private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.v2.vcf";
private static IndexedFastaSequenceFile seq;

View File

@ -140,10 +140,10 @@ public class VCFRecordTest extends BaseTest {
* @return a fake VCF header
*/
public static VCFHeader createFakeHeader() {
Map<String, String> metaData = new HashMap();
Set<String> metaData = new HashSet();
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
metaData.add("two=2");
Set<String> additionalColumns = new HashSet<String>();
metaData.put("format", "VCRv3.2"); // required
metaData.put("two", "2");
additionalColumns.add("FORMAT");
additionalColumns.add("sample1");
return new VCFHeader(metaData, additionalColumns);

View File

@ -22,7 +22,7 @@ import java.util.*;
*/
public class VCFWriterTest extends BaseTest {
private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
private Map<String, String> metaData = new HashMap();
private Set<String> metaData = new HashSet();
private Set<String> additionalColumns = new HashSet<String>();
private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf");
@ -62,9 +62,9 @@ public class VCFWriterTest extends BaseTest {
* create a fake header of known quantity
* @return a fake VCF header
*/
public static VCFHeader createFakeHeader(Map<String, String> metaData, Set<String> additionalColumns) {
metaData.put("format", "VCRv3.2"); // required
metaData.put("two", "2");
public static VCFHeader createFakeHeader(Set<String> metaData, Set<String> additionalColumns) {
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
metaData.add("two=2");
additionalColumns.add("FORMAT");
additionalColumns.add("extra1");
additionalColumns.add("extra2");
@ -109,12 +109,7 @@ public class VCFWriterTest extends BaseTest {
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index], field);
index++;
}
index = 0;
for (String key : header.getMetaData().keySet()) {
Assert.assertEquals(header.getMetaData().get(key), metaData.get(key));
index++;
}
Assert.assertEquals(metaData.size(), index);
Assert.assertEquals(metaData.size(), header.getMetaData().size());
index = 0;
for (String key : header.getGenotypeSamples()) {
Assert.assertTrue(additionalColumns.contains(key));