Stage 3 of Variation refactoring:
We are now VCF3.3 compliant. (Only a few more stages left. Sigh.) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2287 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9e2f831206
commit
e8822a3fb4
|
|
@ -90,7 +90,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
|
||||
public boolean hasStrandBias() {
|
||||
assertNotNull();
|
||||
return this.mCurrentRecord.getInfoValues().containsKey("SB");
|
||||
return this.mCurrentRecord.getInfoValues().containsKey(VCFRecord.STRAND_BIAS_KEY);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -99,7 +99,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
* @return StrandBias with the stored slod
|
||||
*/
|
||||
public double getStrandBias() {
|
||||
return hasStrandBias() ? Double.valueOf(this.mCurrentRecord.getInfoValues().get("SB")) : 0.0;
|
||||
return hasStrandBias() ? Double.valueOf(this.mCurrentRecord.getInfoValues().get(VCFRecord.STRAND_BIAS_KEY)) : 0.0;
|
||||
}
|
||||
|
||||
/** @return the VARIANT_TYPE of the current variant */
|
||||
|
|
@ -291,8 +291,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
}
|
||||
|
||||
public RodVCF next() {
|
||||
mCurrentRecord = mReader.next();
|
||||
return new RodVCF(name, mCurrentRecord, mReader);
|
||||
return new RodVCF(name, mReader.next(), mReader);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||
|
||||
|
||||
public class DepthOfCoverage extends StandardVariantAnnotation {
|
||||
|
|
@ -12,9 +13,9 @@ public class DepthOfCoverage extends StandardVariantAnnotation {
|
|||
return String.format("%d", depth);
|
||||
}
|
||||
|
||||
public String getKeyName() { return "DP"; }
|
||||
public String getKeyName() { return VCFRecord.DEPTH_KEY; }
|
||||
|
||||
public String getDescription() { return "DP,1,Integer,\"Total Depth\""; }
|
||||
public String getDescription() { return getKeyName() + ",1,Integer,\"Total Depth\""; }
|
||||
|
||||
public boolean useZeroQualityReads() { return false; }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
|
@ -20,9 +22,9 @@ public class RMSMappingQuality extends StandardVariantAnnotation {
|
|||
return String.format("%.2f", rms);
|
||||
}
|
||||
|
||||
public String getKeyName() { return "MQ"; }
|
||||
public String getKeyName() { return VCFRecord.RMS_MAPPING_QUALITY_KEY; }
|
||||
|
||||
public String getDescription() { return "MQ,1,Float,\"RMS Mapping Quality\""; }
|
||||
public String getDescription() { return getKeyName() + ",1,Float,\"RMS Mapping Quality\""; }
|
||||
|
||||
public boolean useZeroQualityReads() { return true; }
|
||||
}
|
||||
|
|
@ -117,10 +117,11 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
// setup the header fields
|
||||
Map<String, String> hInfo = new HashMap<String, String>();
|
||||
hInfo.put("format", VCFWriter.VERSION);
|
||||
hInfo.put("source", "VariantAnnotator");
|
||||
hInfo.put("reference", this.getToolkit().getArguments().referenceFile.getName());
|
||||
Set<String> hInfo = new HashSet<String>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add("source=VariantAnnotator");
|
||||
hInfo.add("annotatorReference=" + getToolkit().getArguments().referenceFile.getName());
|
||||
hInfo.addAll(getVCFAnnotationDescriptions(requestedAnnotations));
|
||||
|
||||
vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter = new VCFWriter(vcfHeader, VCF_OUT);
|
||||
|
|
@ -172,6 +173,40 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// option #1: don't specify annotations to be used: standard annotations are used by default
|
||||
public static Set<String> getVCFAnnotationDescriptions() {
|
||||
if ( standardAnnotations == null )
|
||||
determineAllAnnotations();
|
||||
|
||||
TreeSet<String> descriptions = new TreeSet<String>();
|
||||
for ( VariantAnnotation annotation : standardAnnotations.values() )
|
||||
descriptions.add("INFO=" + annotation.getDescription());
|
||||
|
||||
return descriptions;
|
||||
}
|
||||
|
||||
// option #2: specify that all possible annotations be used
|
||||
public static Set<String> getAllVCFAnnotationDescriptions() {
|
||||
if ( standardAnnotations == null )
|
||||
determineAllAnnotations();
|
||||
|
||||
TreeSet<String> descriptions = new TreeSet<String>();
|
||||
for ( VariantAnnotation annotation : allAnnotations.values() )
|
||||
descriptions.add("INFO=" + annotation.getDescription());
|
||||
|
||||
return descriptions;
|
||||
}
|
||||
|
||||
// option #3: specify the exact annotations to be used
|
||||
public static Set<String> getVCFAnnotationDescriptions(Collection<VariantAnnotation> annotations) {
|
||||
|
||||
TreeSet<String> descriptions = new TreeSet<String>();
|
||||
for ( VariantAnnotation annotation : annotations )
|
||||
descriptions.add("INFO=" + annotation.getDescription());
|
||||
|
||||
return descriptions;
|
||||
}
|
||||
|
||||
// option #1: don't specify annotations to be used: standard annotations are used by default
|
||||
public static Map<String, String> getAnnotations(ReferenceContext ref, AlignmentContext context, Variation variation) {
|
||||
if ( standardAnnotations == null )
|
||||
|
|
|
|||
|
|
@ -61,16 +61,6 @@ public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
|
|||
logger.debug("Uniquified sample mapping: " + entry.getKey().first + "/" + entry.getKey().second + " -> " + entry.getValue());
|
||||
}
|
||||
|
||||
// set up the header fields
|
||||
Map<String, String> hInfo = new HashMap<String, String>();
|
||||
hInfo.put("format", VCFWriter.VERSION);
|
||||
hInfo.put("source", "CallsetConcordance");
|
||||
hInfo.put("reference", getToolkit().getArguments().referenceFile.getName());
|
||||
hInfo.put("explanation", "This file represents a concordance test of various call sets - NOT the output from a multi-sample caller");
|
||||
VCFHeader header = new VCFHeader(hInfo, samples);
|
||||
|
||||
vcfWriter = new VCFWriter(header, OUTPUT);
|
||||
|
||||
// initialize requested concordance types
|
||||
requestedTypes = new ArrayList<ConcordanceType>();
|
||||
if (TYPES != null) {
|
||||
|
|
@ -110,6 +100,25 @@ public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
|
|||
throw new StingException("The requested concordance type (" + requestedType + ") isn't a valid concordance option");
|
||||
}
|
||||
}
|
||||
|
||||
// set up the header fields
|
||||
Set<String> hInfo = new HashSet<String>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add("source=CallsetConcordance");
|
||||
hInfo.add("note=\"This file represents a concordance test of various call sets - NOT the output from a multi-sample caller\"");
|
||||
hInfo.addAll(getVCFAnnotationDescriptions(requestedTypes));
|
||||
VCFHeader header = new VCFHeader(hInfo, samples);
|
||||
|
||||
vcfWriter = new VCFWriter(header, OUTPUT);
|
||||
}
|
||||
|
||||
public static Set<String> getVCFAnnotationDescriptions(Collection<ConcordanceType> types) {
|
||||
|
||||
TreeSet<String> descriptions = new TreeSet<String>();
|
||||
for ( ConcordanceType type : types )
|
||||
descriptions.add("INFO=" + type.getInfoDescription());
|
||||
|
||||
return descriptions;
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) {
|
||||
|
|
|
|||
|
|
@ -11,4 +11,5 @@ public interface ConcordanceType {
|
|||
public void initialize(Map<String,String> args, Set<String> samples);
|
||||
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref);
|
||||
public String getInfoName();
|
||||
public String getInfoDescription();
|
||||
}
|
||||
|
|
@ -101,5 +101,6 @@ public class IndelSubsets implements ConcordanceType {
|
|||
return Math.max(leftRun, rightRun);
|
||||
}
|
||||
|
||||
public String getInfoName() { return "IndelSubsets"; }
|
||||
public String getInfoName() { return "IndelSubsets"; }
|
||||
public String getInfoDescription() { return getInfoName() + ",1,String,\"Indel-related subsets\""; }
|
||||
}
|
||||
|
|
@ -39,4 +39,5 @@ public class NWayVenn implements ConcordanceType {
|
|||
}
|
||||
|
||||
public String getInfoName() { return "NwayVenn"; }
|
||||
public String getInfoDescription() { return getInfoName() + ",1,String,\"N-way Venn split\""; }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -95,4 +95,5 @@ public class SNPGenotypeConcordance implements ConcordanceType {
|
|||
}
|
||||
|
||||
public String getInfoName() { return "SnpConcordance"; }
|
||||
public String getInfoDescription() { return getInfoName() + ",1,String,\"SNP concordance test\""; }
|
||||
}
|
||||
|
|
@ -59,4 +59,5 @@ public class SimpleVenn implements ConcordanceType {
|
|||
}
|
||||
|
||||
public String getInfoName() { return "Venn"; }
|
||||
public String getInfoDescription() { return getInfoName() + ",1,String,\"2-way Venn split\""; }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,10 +43,10 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
|
||||
private void initializeVcfWriter(RodVCF rod) {
|
||||
// setup the header fields
|
||||
Map<String, String> hInfo = new HashMap<String, String>();
|
||||
hInfo.put("format", VCFWriter.VERSION);
|
||||
hInfo.put("source", "VariantFiltration");
|
||||
hInfo.put("reference", this.getToolkit().getArguments().referenceFile.getName());
|
||||
Set<String> hInfo = new HashSet<String>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add("source=" + "VariantFiltration");
|
||||
hInfo.add("reference=" + getToolkit().getArguments().referenceFile.getName());
|
||||
|
||||
VCFHeader header = new VCFHeader(hInfo, rod.getHeader().getGenotypeSamples());
|
||||
writer = new VCFWriter(header, out);
|
||||
|
|
@ -151,6 +151,10 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
filterString.append(";" + rec.getFilterString());
|
||||
rec.setFilterString(filterString.toString());
|
||||
}
|
||||
// otherwise, if it's not already filtered, set it to "passing filters"
|
||||
else if ( !rec.isFiltered() ) {
|
||||
rec.setFilterString(VCFRecord.PASSES_FILTERS);
|
||||
}
|
||||
|
||||
if ( writer == null )
|
||||
initializeVcfWriter(variant);
|
||||
|
|
|
|||
|
|
@ -133,17 +133,31 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
|||
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED )
|
||||
samples.clear();
|
||||
|
||||
// get the optional header fields
|
||||
Set<String> headerInfo = new HashSet<String>();
|
||||
if ( UAC.ALL_ANNOTATIONS )
|
||||
headerInfo.addAll(VariantAnnotator.getAllVCFAnnotationDescriptions());
|
||||
else
|
||||
headerInfo.addAll(VariantAnnotator.getVCFAnnotationDescriptions());
|
||||
headerInfo.add("INFO=AF,1,Float,\"Allele Frequency\"");
|
||||
headerInfo.add("INFO=NS,1,Integer,\"Number of Samples With Data\"");
|
||||
if ( !UAC.NO_SLOD )
|
||||
headerInfo.add("INFO=SB,1,Float,\"Strand Bias\"");
|
||||
|
||||
// create the output writer stream
|
||||
if ( VARIANTS_FILE != null )
|
||||
writer = GenotypeWriterFactory.create(UAC.VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE,
|
||||
"UnifiedGenotyper",
|
||||
this.getToolkit().getArguments().referenceFile.getName(),
|
||||
samples);
|
||||
samples,
|
||||
headerInfo);
|
||||
else
|
||||
writer = GenotypeWriterFactory.create(UAC.VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out,
|
||||
"UnifiedGenotyper",
|
||||
this.getToolkit().getArguments().referenceFile.getName(),
|
||||
samples);
|
||||
samples,
|
||||
headerInfo);
|
||||
|
||||
callsMetrics = new CallMetrics();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -50,18 +50,17 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public static VCFHeader getHeader(GATKArgumentCollection args, Set<String> sampleNames) {
|
||||
Map<String, String> metaData = new HashMap<String, String>();
|
||||
Set<String> additionalColumns = new HashSet<String>();
|
||||
|
||||
// Don't output the data for now because it kills our unit test MD5s and is optional
|
||||
// TODO - figure out what to do here
|
||||
//Calendar cal = Calendar.getInstance();
|
||||
//metaData.put("fileDate", String.format("%d%02d%02d", cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH)));
|
||||
|
||||
metaData.put("format", "VCRv3.2");
|
||||
metaData.put("source", "VariantsToVCF");
|
||||
metaData.put("reference", args.referenceFile.getAbsolutePath());
|
||||
Set<String> metaData = new HashSet<String>();
|
||||
metaData.add("source=VariantsToVCF");
|
||||
metaData.add("reference=" + args.referenceFile.getAbsolutePath());
|
||||
|
||||
Set<String> additionalColumns = new HashSet<String>();
|
||||
additionalColumns.add("FORMAT");
|
||||
additionalColumns.addAll(sampleNames);
|
||||
|
||||
|
|
|
|||
|
|
@ -29,13 +29,12 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
|
|||
private VCFWriter vwriter = null;
|
||||
|
||||
public void initializeWriter() {
|
||||
Map<String, String> metaData = new HashMap<String, String>();
|
||||
|
||||
Set<String> metaData = new HashSet<String>();
|
||||
metaData.add("source=VariantsToVCF");
|
||||
metaData.add("reference=" + this.getToolkit().getArguments().referenceFile.getAbsolutePath());
|
||||
|
||||
Set<String> additionalColumns = new HashSet<String>();
|
||||
|
||||
metaData.put("format", "VCRv3.2");
|
||||
metaData.put("source", "VariantsToVCF");
|
||||
metaData.put("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath());
|
||||
|
||||
additionalColumns.add("FORMAT");
|
||||
additionalColumns.addAll(SAMPLES);
|
||||
|
||||
|
|
|
|||
|
|
@ -30,9 +30,9 @@ import java.util.List;
|
|||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class Genotype
|
||||
* Class GenotypeWriter
|
||||
* <p/>
|
||||
* The interface for storing genotype calls.
|
||||
* The interface for writing genotype calls.
|
||||
*/
|
||||
public interface GenotypeWriter {
|
||||
|
||||
|
|
@ -55,8 +55,9 @@ public interface GenotypeWriter {
|
|||
/**
|
||||
* add a multi-sample call if we support it
|
||||
* @param genotypes the list of genotypes, that are backed by sample information
|
||||
* @param variation the variation
|
||||
*/
|
||||
public void addMultiSampleCall(List<Genotype> genotypes, VariationCall metadata);
|
||||
public void addMultiSampleCall(List<Genotype> genotypes, VariationCall variation);
|
||||
|
||||
/**
|
||||
* @return true if we support multisample, false otherwise
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ public class GenotypeWriterFactory {
|
|||
* @param source the source
|
||||
* @param referenceName the ref name
|
||||
* @param sampleNames the sample names
|
||||
* @param headerInfo the optional header info fields
|
||||
* @return the genotype writer object
|
||||
*/
|
||||
public static GenotypeWriter create(GENOTYPE_FORMAT format,
|
||||
|
|
@ -40,7 +41,8 @@ public class GenotypeWriterFactory {
|
|||
File destination,
|
||||
String source,
|
||||
String referenceName,
|
||||
Set<String> sampleNames ) {
|
||||
Set<String> sampleNames,
|
||||
Set<String> headerInfo) {
|
||||
switch (format) {
|
||||
case GLF:
|
||||
return new GLFWriter(header.toString(), destination);
|
||||
|
|
@ -49,7 +51,7 @@ public class GenotypeWriterFactory {
|
|||
case GELI_BINARY:
|
||||
return new GeliAdapter(destination, header);
|
||||
case VCF:
|
||||
return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames);
|
||||
return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames, headerInfo);
|
||||
default:
|
||||
throw new StingException("Genotype writer " + format.toString() + " is not implemented");
|
||||
}
|
||||
|
|
@ -60,14 +62,15 @@ public class GenotypeWriterFactory {
|
|||
PrintStream destination,
|
||||
String source,
|
||||
String referenceName,
|
||||
Set<String> sampleNames ) {
|
||||
Set<String> sampleNames,
|
||||
Set<String> headerInfo) {
|
||||
switch (format) {
|
||||
case GELI:
|
||||
return new GeliTextWriter(destination);
|
||||
case GLF:
|
||||
return new GLFWriter(header.toString(), destination);
|
||||
case VCF:
|
||||
return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames);
|
||||
return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames, headerInfo);
|
||||
default:
|
||||
throw new StingException("Genotype writer to " + format.toString() + " to standard output is not implemented");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,12 +16,25 @@ import java.util.Map;
|
|||
* <p/>
|
||||
*/
|
||||
public class VCFGenotypeRecord implements Genotype {
|
||||
// the symbols for an empty genotype
|
||||
|
||||
// key names
|
||||
public static final String GENOTYPE_KEY = "GT";
|
||||
public static final String GENOTYPE_QUALITY_KEY = "GQ";
|
||||
public static final String DEPTH_KEY = "DP";
|
||||
public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
|
||||
public static final String FILTER_KEY = "FT";
|
||||
public static final String OLD_DEPTH_KEY = "RD";
|
||||
|
||||
// the values for empty fields
|
||||
public static final String EMPTY_GENOTYPE = "./.";
|
||||
public static final String EMPTY_ALLELE = ".";
|
||||
|
||||
public static final int MISSING_GENOTYPE_QUALITY = -1;
|
||||
public static final int MISSING_DEPTH = -1;
|
||||
public static final int MISSING_HAPLOTYPE_QUALITY = -1;
|
||||
public static final String UNFILTERED = ".";
|
||||
|
||||
public static final double MAX_QUAL_VALUE = 99.0;
|
||||
|
||||
// what kind of phasing this genotype has
|
||||
public enum PHASE {
|
||||
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
|
||||
|
|
@ -51,14 +64,21 @@ public class VCFGenotypeRecord implements Genotype {
|
|||
* @param otherFlags other flags
|
||||
*/
|
||||
public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing, Map<String, String> otherFlags) {
|
||||
this.mSampleName = sampleName;
|
||||
mSampleName = sampleName;
|
||||
if (genotypes != null) this.mGenotypeAlleles.addAll(genotypes);
|
||||
this.mPhaseType = phasing;
|
||||
if (otherFlags != null) this.mFields.putAll(otherFlags);
|
||||
mPhaseType = phasing;
|
||||
if (otherFlags != null) {
|
||||
// we need to be backwards compatible
|
||||
if ( otherFlags.containsKey(OLD_DEPTH_KEY) ) {
|
||||
otherFlags.put(DEPTH_KEY, otherFlags.get(OLD_DEPTH_KEY));
|
||||
otherFlags.remove(OLD_DEPTH_KEY);
|
||||
}
|
||||
mFields.putAll(otherFlags);
|
||||
}
|
||||
}
|
||||
|
||||
public void setVCFRecord(VCFRecord record) {
|
||||
this.mRecord = record;
|
||||
mRecord = record;
|
||||
}
|
||||
|
||||
public void setSampleName(String name) {
|
||||
|
|
@ -102,16 +122,11 @@ public class VCFGenotypeRecord implements Genotype {
|
|||
}
|
||||
|
||||
public double getNegLog10PError() {
|
||||
return ( mFields.containsKey("GQ") ? Double.valueOf(mFields.get("GQ")) / 10.0 : 0.0);
|
||||
return ( mFields.containsKey(GENOTYPE_QUALITY_KEY) ? Double.valueOf(mFields.get(GENOTYPE_QUALITY_KEY)) / 10.0 : MISSING_GENOTYPE_QUALITY);
|
||||
}
|
||||
|
||||
public int getReadCount() {
|
||||
int depth = MISSING_DEPTH;
|
||||
if ( mFields.containsKey("RD") )
|
||||
depth = Integer.valueOf(mFields.get("RD"));
|
||||
else if ( mFields.containsKey("DP") )
|
||||
depth = Integer.valueOf(mFields.get("DP"));
|
||||
return depth;
|
||||
return ( mFields.containsKey(DEPTH_KEY) ? Integer.valueOf(mFields.get(DEPTH_KEY)) : MISSING_DEPTH);
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() {
|
||||
|
|
@ -227,11 +242,26 @@ public class VCFGenotypeRecord implements Genotype {
|
|||
StringBuilder builder = new StringBuilder();
|
||||
builder.append(toGenotypeString(altAlleles));
|
||||
for (String field : mFields.keySet()) {
|
||||
if (mFields.get(field).equals("")) continue;
|
||||
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
|
||||
builder.append(mFields.get(field));
|
||||
|
||||
if (mFields.get(field).equals(""))
|
||||
builder.append(getMissingFieldValue(field));
|
||||
else
|
||||
builder.append(mFields.get(field));
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static String getMissingFieldValue(String field) {
|
||||
String result = "";
|
||||
if ( field.equals(GENOTYPE_QUALITY_KEY) )
|
||||
result = String.valueOf(MISSING_GENOTYPE_QUALITY);
|
||||
else if ( field.equals(DEPTH_KEY) )
|
||||
result = String.valueOf(MISSING_DEPTH);
|
||||
else if ( field.equals(FILTER_KEY) )
|
||||
result = UNFILTERED;
|
||||
// TODO -- support haplotype quality
|
||||
//else if ( field.equals(HAPLOTYPE_QUALITY_KEY) )
|
||||
// result = String.valueOf(MISSING_HAPLOTYPE_QUALITY);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,23 +27,23 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class);
|
||||
|
||||
|
||||
public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo, Set<String> sampleNames) {
|
||||
public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo, Set<String> sampleNames, Set<String> headerInfo) {
|
||||
mReferenceName = referenceName;
|
||||
mSource = source;
|
||||
mSampleNames.addAll(sampleNames);
|
||||
|
||||
initializeHeader();
|
||||
initializeHeader(headerInfo);
|
||||
|
||||
if (writeTo == null) throw new RuntimeException("VCF output file must not be null");
|
||||
mWriter = new VCFWriter(mHeader, writeTo);
|
||||
}
|
||||
|
||||
public VCFGenotypeWriterAdapter(String source, String referenceName, OutputStream writeTo, Set<String> sampleNames) {
|
||||
public VCFGenotypeWriterAdapter(String source, String referenceName, OutputStream writeTo, Set<String> sampleNames, Set<String> headerInfo) {
|
||||
mReferenceName = referenceName;
|
||||
mSource = source;
|
||||
mSampleNames.addAll(sampleNames);
|
||||
|
||||
initializeHeader();
|
||||
initializeHeader(headerInfo);
|
||||
|
||||
if (writeTo == null) throw new RuntimeException("VCF output stream must not be null");
|
||||
mWriter = new VCFWriter(mHeader, writeTo);
|
||||
|
|
@ -51,15 +51,18 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
|
||||
/**
|
||||
* initialize this VCF header
|
||||
*
|
||||
* @param optionalHeaderInfo the optional header fields
|
||||
*/
|
||||
private void initializeHeader() {
|
||||
Map<String, String> hInfo = new HashMap<String, String>();
|
||||
private void initializeHeader(Set<String> optionalHeaderInfo) {
|
||||
Set<String> hInfo = new TreeSet<String>();
|
||||
|
||||
// setup the header fields
|
||||
hInfo.put("format", VCFWriter.VERSION);
|
||||
hInfo.put("source", mSource);
|
||||
hInfo.put("reference", mReferenceName);
|
||||
|
||||
hInfo.add(VCFHeader.FULL_FORMAT_LINE);
|
||||
hInfo.add("source=" + mSource);
|
||||
hInfo.add("reference=" + mReferenceName);
|
||||
hInfo.addAll(optionalHeaderInfo);
|
||||
|
||||
// setup the sample names
|
||||
mHeader = new VCFHeader(hInfo, mSampleNames);
|
||||
}
|
||||
|
|
@ -114,7 +117,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
throw new IllegalArgumentException("Only VCFVariationCall objects should be passed in to the VCF writers");
|
||||
|
||||
VCFParameters params = new VCFParameters();
|
||||
params.addFormatItem("GT");
|
||||
params.addFormatItem(VCFGenotypeRecord.GENOTYPE_KEY);
|
||||
|
||||
// get the location and reference
|
||||
if ( genotypes.size() == 0 ) {
|
||||
|
|
@ -166,10 +169,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(),
|
||||
params.getContig(),
|
||||
params.getPosition(),
|
||||
(dbSnpID == null ? "." : dbSnpID),
|
||||
(dbSnpID == null ? VCFRecord.EMPTY_ID_FIELD : dbSnpID),
|
||||
params.getAlternateBases(),
|
||||
qual,
|
||||
"0",
|
||||
VCFRecord.UNFILTERED,
|
||||
infoFields,
|
||||
params.getFormatString(),
|
||||
params.getGenotypesRecords());
|
||||
|
|
@ -189,15 +192,15 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
Map<String, String> infoFields = new HashMap<String, String>();
|
||||
if ( locusdata != null ) {
|
||||
if ( locusdata.getSLOD() != null )
|
||||
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
|
||||
infoFields.put(VCFRecord.STRAND_BIAS_KEY, String.format("%.2f", locusdata.getSLOD()));
|
||||
if ( locusdata.hasNonRefAlleleFrequency() )
|
||||
infoFields.put("AF", String.format("%.2f", locusdata.getNonRefAlleleFrequency()));
|
||||
infoFields.put(VCFRecord.ALLELE_FREQUENCY_KEY, String.format("%.2f", locusdata.getNonRefAlleleFrequency()));
|
||||
Map<String, String> otherFields = locusdata.getFields();
|
||||
if ( otherFields != null ) {
|
||||
infoFields.putAll(otherFields);
|
||||
}
|
||||
}
|
||||
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
|
||||
infoFields.put(VCFRecord.SAMPLE_NUMBER_KEY, String.valueOf(params.getGenotypesRecords().size()));
|
||||
return infoFields;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,5 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
|
|
@ -14,13 +12,25 @@ import java.util.*;
|
|||
*/
|
||||
public class VCFHeader {
|
||||
|
||||
public static final String FILE_FORMAT_KEY = "fileformat=";
|
||||
public static final String OLD_FILE_FORMAT_KEY = "format="; // from version 3.2
|
||||
|
||||
|
||||
/** the current vcf version we support. */
|
||||
public static final String VCF_VERSION_HEADER = "VCFv";
|
||||
public static final String OLD_VCF_VERSION_HEADER = "VCRv"; // from version 3.2
|
||||
public static final double VCF_VERSION_NUMBER = 3.3;
|
||||
public static final String VCF_VERSION = VCF_VERSION_HEADER + VCF_VERSION_NUMBER;
|
||||
|
||||
public static final String FULL_FORMAT_LINE = FILE_FORMAT_KEY + VCF_VERSION;
|
||||
|
||||
// the manditory header fields
|
||||
public enum HEADER_FIELDS {
|
||||
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
|
||||
}
|
||||
|
||||
// the associated meta data
|
||||
private final Map<String, String> mMetaData = new HashMap<String, String>();
|
||||
private final Set<String> mMetaData;
|
||||
|
||||
// the list of auxillary tags
|
||||
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
|
||||
|
|
@ -31,22 +41,17 @@ public class VCFHeader {
|
|||
// the header string indicator
|
||||
public static final String HEADER_INDICATOR = "#";
|
||||
|
||||
/** our log, which we use to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(VCFHeader.class);
|
||||
|
||||
/** do we have genotying data? */
|
||||
private boolean hasGenotypingData = false;
|
||||
|
||||
/** the current vcf version we support. */
|
||||
private static final String VCF_VERSION = "VCRv3.2";
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
* @param metaData the meta data associated with this header
|
||||
*/
|
||||
public VCFHeader(Map<String, String> metaData) {
|
||||
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
||||
public VCFHeader(Set<String> metaData) {
|
||||
mMetaData = new TreeSet<String>(metaData);
|
||||
checkVCFVersion();
|
||||
}
|
||||
|
||||
|
|
@ -56,8 +61,8 @@ public class VCFHeader {
|
|||
* @param metaData the meta data associated with this header
|
||||
* @param genotypeSampleNames the genotype format field, and the sample names
|
||||
*/
|
||||
public VCFHeader(Map<String, String> metaData, Set<String> genotypeSampleNames) {
|
||||
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
||||
public VCFHeader(Set<String> metaData, Set<String> genotypeSampleNames) {
|
||||
mMetaData = new TreeSet<String>(metaData);
|
||||
for (String col : genotypeSampleNames) {
|
||||
if (!col.equals("FORMAT"))
|
||||
mGenotypeSampleNames.add(col);
|
||||
|
|
@ -71,13 +76,34 @@ public class VCFHeader {
|
|||
* or the version is not present
|
||||
*/
|
||||
public void checkVCFVersion() {
|
||||
if (mMetaData.containsKey("format")) {
|
||||
if (mMetaData.get("format").equals(VCF_VERSION))
|
||||
return;
|
||||
throw new RuntimeException("VCFHeader: VCF version of " + mMetaData.get("format") +
|
||||
" doesn't match the supported version of " + VCF_VERSION);
|
||||
String version = null;
|
||||
for ( String field : mMetaData ) {
|
||||
if ( field.startsWith(FILE_FORMAT_KEY) ) {
|
||||
version = field.substring(FILE_FORMAT_KEY.length());
|
||||
break;
|
||||
}
|
||||
else if ( field.startsWith(OLD_FILE_FORMAT_KEY) ) {
|
||||
version = field.substring(OLD_FILE_FORMAT_KEY.length());
|
||||
break;
|
||||
}
|
||||
}
|
||||
throw new RuntimeException("VCFHeader: VCF version isn't present");
|
||||
|
||||
if ( version == null )
|
||||
mMetaData.add(FULL_FORMAT_LINE);
|
||||
else if ( !isSupportedVersion(version) )
|
||||
throw new RuntimeException("VCF version " + version +
|
||||
" is not yet supported; only version " + VCF_VERSION + " and earlier can be used");
|
||||
}
|
||||
|
||||
private boolean isSupportedVersion(String version) {
|
||||
if ( !version.startsWith(VCF_VERSION_HEADER) && !version.startsWith(OLD_VCF_VERSION_HEADER) )
|
||||
return false;
|
||||
try {
|
||||
double dVersion = Double.valueOf(version.substring(VCF_VERSION_HEADER.length()));
|
||||
return dVersion <= VCF_VERSION_NUMBER;
|
||||
} catch (Exception e) { }
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -96,9 +122,9 @@ public class VCFHeader {
|
|||
/**
|
||||
* get the meta data, associated with this header
|
||||
*
|
||||
* @return a map of the meta data
|
||||
* @return a set of the meta data
|
||||
*/
|
||||
public Map<String, String> getMetaData() {
|
||||
public Set<String> getMetaData() {
|
||||
return mMetaData;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -21,9 +21,6 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
// our next record
|
||||
private VCFRecord mNextRecord = null;
|
||||
|
||||
// a pattern we use for detecting meta data and header lines
|
||||
private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$");
|
||||
|
||||
// our pattern matching for the genotype mFields
|
||||
private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)");
|
||||
|
||||
|
|
@ -142,24 +139,11 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
* @return a VCF Header created from the list of stinrgs
|
||||
*/
|
||||
protected VCFHeader createHeader(List<String> headerStrings) {
|
||||
Map<String, String> metaData = new HashMap<String, String>();
|
||||
Set<String> metaData = new TreeSet<String>();
|
||||
Set<String> auxTags = new LinkedHashSet<String>();
|
||||
// iterate over all the passed in strings
|
||||
for (String str : headerStrings) {
|
||||
Matcher matcher = pMeta.matcher(str);
|
||||
if (matcher.matches()) {
|
||||
String metaKey;
|
||||
String metaValue = "";
|
||||
if (matcher.groupCount() < 1) continue;
|
||||
if (matcher.groupCount() == 2) metaValue = matcher.group(2);
|
||||
metaKey = matcher.group(1);
|
||||
metaData.put(metaKey, metaValue);
|
||||
}
|
||||
}
|
||||
|
||||
// iterate over all the passed in strings
|
||||
for (String str : headerStrings) { // TODO: fix, we shouldn't loop over every line
|
||||
if (str.startsWith("#") && !str.startsWith("##")) {
|
||||
for ( String str : headerStrings ) {
|
||||
if ( !str.startsWith("##") ) {
|
||||
String[] strings = str.substring(1).split("\\s+");
|
||||
// the columns should be in order according to Richard Durbin
|
||||
int arrayIndex = 0;
|
||||
|
|
@ -177,8 +161,11 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
auxTags.add(strings[arrayIndex]);
|
||||
arrayIndex++;
|
||||
}
|
||||
} else {
|
||||
metaData.add(str.substring(2));
|
||||
}
|
||||
}
|
||||
|
||||
return new VCFHeader(metaData, auxTags);
|
||||
}
|
||||
|
||||
|
|
@ -266,7 +253,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":");
|
||||
parse = genotypeString.substring(0, nextDivider);
|
||||
}
|
||||
if (key.equals("GT")) {
|
||||
if (key.equals(VCFGenotypeRecord.GENOTYPE_KEY)) {
|
||||
Matcher m = gtPattern.matcher(parse);
|
||||
if (!m.matches())
|
||||
throw new RuntimeException("VCFReader: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse);
|
||||
|
|
@ -274,6 +261,8 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
addAllele(m.group(1), altAlleles, referenceBase, bases);
|
||||
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
|
||||
} else {
|
||||
if ( parse.length() == 0 )
|
||||
parse = VCFGenotypeRecord.getMissingFieldValue(key);
|
||||
tagToValue.put(key, parse);
|
||||
}
|
||||
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
|
||||
|
|
|
|||
|
|
@ -11,13 +11,31 @@ import java.util.*;
|
|||
*/
|
||||
public class VCFRecord implements Variation, VariantBackedByGenotype {
|
||||
|
||||
// standard info field keys
|
||||
public static final String ANCESTRAL_ALLELE_KEY = "AA";
|
||||
public static final String ALLELE_COUNT_KEY = "AC";
|
||||
public static final String ALLELE_FREQUENCY_KEY = "AF";
|
||||
public static final String ALLELE_NUMBER_KEY = "AN";
|
||||
public static final String RMS_BASE_QUALITY_KEY = "BQ";
|
||||
public static final String DBSNP_KEY = "DB";
|
||||
public static final String DEPTH_KEY = "DP";
|
||||
public static final String HAPMAP2_KEY = "H2";
|
||||
public static final String RMS_MAPPING_QUALITY_KEY = "MQ";
|
||||
public static final String SAMPLE_NUMBER_KEY = "NS";
|
||||
public static final String STRAND_BIAS_KEY = "SB";
|
||||
|
||||
// commonly used strings that are in the standard
|
||||
public static final String FORMAT_FIELD_SEPERATOR = ":";
|
||||
public static final String GENOTYPE_FIELD_SEPERATOR = ":";
|
||||
public static final String FIELD_SEPERATOR = "\t";
|
||||
public static final String FILTER_CODE_SEPERATOR = ";";
|
||||
public static final String INFO_FIELD_SEPERATOR = ";";
|
||||
|
||||
// default values
|
||||
public static final String UNFILTERED = ".";
|
||||
public static final String PASSES_FILTERS = "0";
|
||||
public static final String EMPTY_INFO_FIELD = ".";
|
||||
public static final String EMPTY_ID_FIELD = ".";
|
||||
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
|
||||
|
||||
// the reference base
|
||||
|
|
@ -139,13 +157,12 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
|||
String vals[] = columnValues.get(val).split(";");
|
||||
for (String alt : vals) {
|
||||
String keyVal[] = alt.split("=");
|
||||
if (keyVal.length == 1 && keyVal[0].equals(".")) {
|
||||
if ( keyVal.length == 1 )
|
||||
this.addInfoField(keyVal[0], "");
|
||||
break;
|
||||
}
|
||||
if (keyVal.length != 2)
|
||||
else if (keyVal.length == 2)
|
||||
this.addInfoField(keyVal[0], keyVal[1]);
|
||||
else
|
||||
throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt);
|
||||
this.addInfoField(keyVal[0], keyVal[1]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
@ -231,14 +248,14 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
|||
}
|
||||
|
||||
public double getNonRefAlleleFrequency() {
|
||||
if ( mInfoFields.containsKey("AF") ) {
|
||||
return Double.valueOf(mInfoFields.get("AF"));
|
||||
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
|
||||
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
|
||||
} else {
|
||||
// this is the poor man's AF
|
||||
if ( mInfoFields.containsKey("AC") && mInfoFields.containsKey("AN")) {
|
||||
String splt[] = mInfoFields.get("AC").split(",");
|
||||
if ( mInfoFields.containsKey(ALLELE_COUNT_KEY) && mInfoFields.containsKey(ALLELE_NUMBER_KEY)) {
|
||||
String splt[] = mInfoFields.get(ALLELE_COUNT_KEY).split(",");
|
||||
if ( splt.length > 0 ) {
|
||||
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get("AN")));
|
||||
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get(ALLELE_NUMBER_KEY)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -250,9 +267,13 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
|||
if ( !hasAlternateAllele() )
|
||||
return VARIANT_TYPE.REFERENCE;
|
||||
|
||||
// TODO -- figure out what to do about records with more than one type
|
||||
VCFGenotypeEncoding encoding = mAlts.get(0);
|
||||
switch ( encoding.getType() ) {
|
||||
VCFGenotypeEncoding.TYPE type = mAlts.get(0).getType();
|
||||
for (int i = 1; i < mAlts.size(); i++) {
|
||||
if ( mAlts.get(i).getType() != type )
|
||||
throw new IllegalStateException("The record contains multiple encoding types");
|
||||
}
|
||||
|
||||
switch ( type ) {
|
||||
case SINGLE_BASE:
|
||||
return VARIANT_TYPE.SNP;
|
||||
case DELETION:
|
||||
|
|
@ -309,23 +330,20 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
|||
/**
|
||||
* get the filter criteria
|
||||
*
|
||||
* @return an array of strings representing the filtering criteria, or 0 is none are applied
|
||||
* @return an array of strings representing the filtering criteria, or UNFILTERED if none are applied
|
||||
*/
|
||||
public String[] getFilteringCodes() {
|
||||
if (mFilterString == null) return new String[]{"0"};
|
||||
return this.mFilterString.split(FILTER_CODE_SEPERATOR);
|
||||
if (mFilterString == null) return new String[]{UNFILTERED};
|
||||
return mFilterString.split(FILTER_CODE_SEPERATOR);
|
||||
}
|
||||
|
||||
public boolean isFiltered() {
|
||||
String[] codes = getFilteringCodes();
|
||||
if ( codes.length > 1 ) return true;
|
||||
else if ( codes[0].equals(".") || codes[0].equals("0") ) return false;
|
||||
else return true;
|
||||
return !codes[0].equals(UNFILTERED) && !codes[0].equals(PASSES_FILTERS);
|
||||
}
|
||||
|
||||
public boolean hasFilteringCodes() {
|
||||
// todo --- currently always returns true
|
||||
return getFilteringCodes() != null;
|
||||
return mFilterString != null;
|
||||
}
|
||||
|
||||
public String getFilterString() {
|
||||
|
|
@ -567,7 +585,9 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
|||
if (!this.mLoc.equals(other.mLoc)) return false;
|
||||
if (!this.mID.equals(other.mID)) return false;
|
||||
if (this.mQual != other.mQual) return false;
|
||||
if (!this.mFilterString.equals(other.mFilterString)) return false;
|
||||
if ( this.mFilterString == null ) {
|
||||
if ( other.mFilterString != null ) return false;
|
||||
} else if ( !this.mFilterString.equals(other.mFilterString) ) return false;
|
||||
if (!this.mInfoFields.equals(other.mInfoFields)) return false;
|
||||
if (!this.mGenotypeFields.equals(other.mGenotypeFields)) return false;
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -20,12 +20,38 @@ public class VCFUtils {
|
|||
private VCFUtils() { }
|
||||
|
||||
|
||||
/**
|
||||
* Gets the header fields from all VCF rods input by the user
|
||||
*
|
||||
* @param toolkit GATK engine
|
||||
*
|
||||
* @return a set of all fields
|
||||
*/
|
||||
public static Set<String> getHeaderFields(GenomeAnalysisEngine toolkit) {
|
||||
|
||||
// keep a map of sample name to occurrences encountered
|
||||
TreeSet<String> fields = new TreeSet<String>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
ReferenceOrderedData rod = source.getReferenceOrderedData();
|
||||
if ( rod.getType().equals(RodVCF.class) ) {
|
||||
VCFReader reader = new VCFReader(rod.getFile());
|
||||
fields.addAll(reader.getHeader().getMetaData());
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||
* (e.g. sampleX.1, sampleX.2, ...)
|
||||
* When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping
|
||||
* from rod/sample pairs to the new uniquified names
|
||||
*
|
||||
*
|
||||
* @param toolkit GATK engine
|
||||
* @param samples set to store the sample names
|
||||
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
|
||||
|
|
@ -104,7 +130,7 @@ public class VCFUtils {
|
|||
public static VCFRecord mergeRecords(List<RodVCF> rods, Map<Pair<String, String>, String> rodNamesToSampleNames) {
|
||||
|
||||
VCFParameters params = new VCFParameters();
|
||||
params.addFormatItem("GT");
|
||||
params.addFormatItem(VCFGenotypeRecord.GENOTYPE_KEY);
|
||||
|
||||
// keep track of the locus specific data so we can merge them intelligently
|
||||
int totalReadDepth = 0;
|
||||
|
|
@ -122,7 +148,9 @@ public class VCFUtils {
|
|||
if ( params.getPosition() < 1 )
|
||||
params.setLocations(rod.getLocation(), call.getReference());
|
||||
params.addGenotypeRecord(createVCFGenotypeRecord(params, call, rod.mCurrentRecord));
|
||||
totalReadDepth += call.getReadCount();
|
||||
int depth = call.getReadCount();
|
||||
if ( depth > 0 )
|
||||
totalReadDepth += call.getReadCount();
|
||||
}
|
||||
|
||||
// set the overall confidence to be the max entry we see
|
||||
|
|
@ -142,14 +170,14 @@ public class VCFUtils {
|
|||
}
|
||||
|
||||
Map<String, String> infoFields = new HashMap<String, String>();
|
||||
infoFields.put("DP", String.format("%d", totalReadDepth));
|
||||
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
|
||||
infoFields.put(VCFRecord.DEPTH_KEY, String.format("%d", totalReadDepth));
|
||||
infoFields.put(VCFRecord.SAMPLE_NUMBER_KEY, String.valueOf(params.getGenotypesRecords().size()));
|
||||
|
||||
// set the overall strand bias and allele frequency to be the average of all entries we've seen
|
||||
if ( SLODsSeen > 0 )
|
||||
infoFields.put("SB", String.format("%.2f", (totalSLOD/(double)SLODsSeen)));
|
||||
infoFields.put(VCFRecord.STRAND_BIAS_KEY, String.format("%.2f", (totalSLOD/(double)SLODsSeen)));
|
||||
if ( freqsSeen > 0 )
|
||||
infoFields.put("AF", String.format("%.2f", (totalFreq/(double)freqsSeen)));
|
||||
infoFields.put(VCFRecord.ALLELE_FREQUENCY_KEY, String.format("%.2f", (totalFreq/(double)freqsSeen)));
|
||||
|
||||
return new VCFRecord(params.getReferenceBase(),
|
||||
params.getContig(),
|
||||
|
|
@ -175,13 +203,12 @@ public class VCFUtils {
|
|||
public static VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeRecord gtype, VCFRecord vcfrecord) {
|
||||
Map<String, String> map = new HashMap<String, String>();
|
||||
|
||||
// calculate the RMS mapping qualities and the read depth
|
||||
int readDepth = gtype.getReadCount();
|
||||
map.put("RD", String.valueOf(readDepth));
|
||||
params.addFormatItem("RD");
|
||||
double qual = 10.0 * gtype.getNegLog10PError();
|
||||
map.put("GQ", String.format("%.2f", qual));
|
||||
params.addFormatItem("GQ");
|
||||
// calculate the genotype quality and the read depth
|
||||
map.put(VCFGenotypeRecord.DEPTH_KEY, String.valueOf(gtype.getReadCount()));
|
||||
params.addFormatItem(VCFGenotypeRecord.DEPTH_KEY);
|
||||
double qual = Math.min(10.0 * gtype.getNegLog10PError(), VCFGenotypeRecord.MAX_QUAL_VALUE);
|
||||
map.put(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY, String.format("%.2f", qual));
|
||||
params.addFormatItem(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
|
||||
|
||||
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
|
||||
for (VCFGenotypeEncoding allele : alleles) {
|
||||
|
|
@ -208,12 +235,11 @@ public class VCFUtils {
|
|||
Map<String, String> map = new HashMap<String, String>();
|
||||
|
||||
// calculate the RMS mapping qualities and the read depth
|
||||
int readDepth = gtype.getReadCount();
|
||||
map.put("RD", String.valueOf(readDepth));
|
||||
params.addFormatItem("RD");
|
||||
double qual = gtype.getNegLog10PError();
|
||||
map.put("GQ", String.format("%.2f", qual));
|
||||
params.addFormatItem("GQ");
|
||||
map.put(VCFGenotypeRecord.DEPTH_KEY, String.valueOf(gtype.getReadCount()));
|
||||
params.addFormatItem(VCFGenotypeRecord.DEPTH_KEY);
|
||||
double qual = Math.min(10.0 * gtype.getNegLog10PError(), VCFGenotypeRecord.MAX_QUAL_VALUE);
|
||||
map.put(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY, String.format("%.2f", qual));
|
||||
params.addFormatItem(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
|
||||
|
||||
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
|
||||
for (VCFGenotypeEncoding allele : alleles) {
|
||||
|
|
|
|||
|
|
@ -2,14 +2,14 @@ package org.broadinstitute.sting.utils.genotype.vcf;
|
|||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* this class writers VCF files
|
||||
*/
|
||||
public class VCFWriter {
|
||||
|
||||
public static final String VERSION = "VCRv3.2";
|
||||
|
||||
|
||||
// the VCF header we're storing
|
||||
private VCFHeader mHeader;
|
||||
|
||||
|
|
@ -49,10 +49,25 @@ public class VCFWriter {
|
|||
mWriter = new BufferedWriter(
|
||||
new OutputStreamWriter(location));
|
||||
try {
|
||||
// write the header meta-data out
|
||||
for (String metadata : header.getMetaData().keySet()) {
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "=" + header.getMetaData().get(metadata) + "\n");
|
||||
// the fileformat field needs to be written first
|
||||
TreeSet<String> allMetaData = new TreeSet<String>(header.getMetaData());
|
||||
for ( String metadata : allMetaData ) {
|
||||
if ( metadata.startsWith(VCFHeader.FILE_FORMAT_KEY) ) {
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "\n");
|
||||
break;
|
||||
}
|
||||
else if ( metadata.startsWith(VCFHeader.OLD_FILE_FORMAT_KEY) ) {
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeader.FILE_FORMAT_KEY + metadata.substring(VCFHeader.OLD_FILE_FORMAT_KEY.length()) + "\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// write the rest of the header meta-data out
|
||||
for ( String metadata : header.getMetaData() ) {
|
||||
if ( !metadata.startsWith(VCFHeader.FILE_FORMAT_KEY) && !metadata.startsWith(VCFHeader.OLD_FILE_FORMAT_KEY) )
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "\n");
|
||||
}
|
||||
|
||||
// write out the column line
|
||||
StringBuilder b = new StringBuilder();
|
||||
b.append(VCFHeader.HEADER_INDICATOR);
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ public class RodVCFTest extends BaseTest {
|
|||
@Test
|
||||
public void testToString() {
|
||||
// slightly altered line, due to map ordering
|
||||
final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tAF=0.786;DP=258;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n";
|
||||
final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tAF=0.786;DP=258;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:\n";
|
||||
RodVCF vcf = getVCFObject();
|
||||
VCFReader reader = new VCFReader(vcfFile);
|
||||
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
|
||||
|
|
@ -105,7 +105,7 @@ public class RodVCFTest extends BaseTest {
|
|||
// verify the first line too
|
||||
if (first) {
|
||||
if (!firstLine.equals(rec1.toStringEncoding(mHeader) + "\n")) {
|
||||
fail("VCF record rec1.toStringEncoding() != expected string :\n" + rec1.toStringEncoding(mHeader) + firstLine);
|
||||
fail("VCF record rec1.toStringEncoding() != expected string :\n" + rec1.toStringEncoding(mHeader) + "\n" + firstLine);
|
||||
}
|
||||
first = false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ public class SecondBaseSkewIntegrationTest extends WalkerTest {
|
|||
+"-B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
|
||||
+"-vcf %s -sample variant -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list";
|
||||
|
||||
String md5_for_this_test = "a2ec1d36e77da56b4a11eef34d73296e";
|
||||
String md5_for_this_test = "f7e67c353d3113447d1b9c8c39de6ed0";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(test_args,1, Arrays.asList(md5_for_this_test));
|
||||
executeTest("Testing on E2 annotated but not Q2 annotated file ",spec);
|
||||
|
|
@ -49,7 +49,7 @@ public class SecondBaseSkewIntegrationTest extends WalkerTest {
|
|||
+"-B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
|
||||
+"-vcf %s -sample variant -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list";
|
||||
|
||||
String md5_for_this_test = "f105fd8a7ae7026a55107b86e768553a";
|
||||
String md5_for_this_test = "3eee411119888fc4633870a91ed2093d";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(test_args,1, Arrays.asList(md5_for_this_test));
|
||||
executeTest("Testing on bam file without 2bb annotations ",spec);
|
||||
|
|
@ -61,7 +61,7 @@ public class SecondBaseSkewIntegrationTest extends WalkerTest {
|
|||
+ " -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -A SecondBaseSkew"
|
||||
+ " -sample variant -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pileup_test_chr15.vcf"
|
||||
+ " -vcf %s -L chr15:46347148";
|
||||
String expected_md5 = "160a8e3826eb745bcfe2f463f73e1ec7";
|
||||
String expected_md5 = "c70dfb30c3caa9184604f88bc7f62a07";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(test_args,1,Arrays.asList(expected_md5));
|
||||
executeTest("Testing on locus with many indels", spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,8 +34,8 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
|
||||
public static String secondBaseTestmd5( int testNo ) {
|
||||
switch ( testNo ) {
|
||||
case 1: return "8f5b4b29eefb25d6b7b9e32a1c90f144";
|
||||
case 2: return "3a53d945b38e1fc87a801f23115222fe";
|
||||
case 1: return "bf64bac186fd682018dd7f0419d90190";
|
||||
case 2: return "67f40627b12be31efe02c9d853fbcf37";
|
||||
default: throw new StingException("Impossible test has been run: secondbasetest number "+testNo);
|
||||
}
|
||||
}
|
||||
|
|
@ -50,7 +50,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testHasAnnotsNotAsking1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("0ff01251afdabafb4e137357b25be72a"));
|
||||
Arrays.asList("6903f8b31820ce7a56230b99f9a9309c"));
|
||||
executeTest("test file has annotations, not asking for annotations, #1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -58,7 +58,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testHasAnnotsNotAsking2() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
|
||||
Arrays.asList("227f38b2f322780187eacd0c98ace3e6"));
|
||||
Arrays.asList("5298c24e956361d209f14ac6138a3bbd"));
|
||||
executeTest("test file has annotations, not asking for annotations, #2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testHasAnnotsAsking1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("9a30487ad885f4d49569032fe6463af3"));
|
||||
Arrays.asList("31ddecd3118bd6aea00bcd0369a1b32f"));
|
||||
executeTest("test file has annotations, asking for annotations, #1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testHasAnnotsAsking2() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
|
||||
Arrays.asList("ef01d735ea0bcfeb6e7394c65f2a1938"));
|
||||
Arrays.asList("553c55f3bcf31a6b1d9a2c1d27fde480"));
|
||||
executeTest("test file has annotations, asking for annotations, #2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testNoAnnotsNotAsking1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("18d85a9711d56bdf7e2327b83d6745e2"));
|
||||
Arrays.asList("b46f395864cb71b887d69342f56e7fdb"));
|
||||
executeTest("test file doesn't have annotations, not asking for annotations, #1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testNoAnnotsNotAsking2() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
|
||||
Arrays.asList("da51f18d189fb0fd804822c81b786e0f"));
|
||||
Arrays.asList("aaae89c48fcab615fe4204220ec62859"));
|
||||
executeTest("test file doesn't have annotations, not asking for annotations, #2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testNoAnnotsAsking1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("7e6ceb79e9a1f104723299ed68b236c6"));
|
||||
Arrays.asList("fa1071b292fd2e9db5c36410ffd44fdb"));
|
||||
executeTest("test file doesn't have annotations, asking for annotations, #1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
public void testNoAnnotsAsking2() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
|
||||
Arrays.asList("87748d4c80ff76701dd01d7b0f803249"));
|
||||
Arrays.asList("9d133a91949e477523a039a0a52ac2a8"));
|
||||
executeTest("test file doesn't have annotations, asking for annotations, #2", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
|||
public void testSimpleVenn() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1,
|
||||
Arrays.asList("851b68004874f3a2e76d795e7401f8a0"));
|
||||
Arrays.asList("2c7e18901dbf27bac9f36b3dbee063c6"));
|
||||
executeTest("testSimpleVenn", spec);
|
||||
}
|
||||
|
||||
|
|
@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
|||
public void testSNPConcordance() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
|
||||
Arrays.asList("7afb56b30257fe2d66bee7a029d75685"));
|
||||
Arrays.asList("c21d59fc3194c39c662d2e74b53dcf9c"));
|
||||
executeTest("testSNPConcordance", spec);
|
||||
}
|
||||
|
||||
|
|
@ -30,7 +30,15 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
|||
public void testNWayVenn() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1,
|
||||
Arrays.asList("f452c04c600ad10c054f18b0c77b53d5"));
|
||||
Arrays.asList("2b38ae235edd10773dbee0bfae036e35"));
|
||||
executeTest("testNWayVenn", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMulti() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5", 1,
|
||||
Arrays.asList("9bcc83aadac00a160cef20e7126368ee"));
|
||||
executeTest("testMulti", spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -16,7 +16,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
public void testNoAction() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("2408d449fbe7bf74099cc53d2d97c248"));
|
||||
Arrays.asList("49817e684effce7b6f3d5776dc781988"));
|
||||
executeTest("test no action", spec);
|
||||
}
|
||||
|
||||
|
|
@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
public void testClusteredSnps() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -window 10 -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("58a2d4cd3d3ba1460833b45b9b8455c2"));
|
||||
Arrays.asList("ee5e4e00bf25a912e8cab3e768fa0e7d"));
|
||||
executeTest("test clustered SNPs", spec);
|
||||
}
|
||||
|
||||
|
|
@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
public void testMask() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -mask foo -B mask,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("9fc2cb210b3595159f34ddfba5a2e572"));
|
||||
Arrays.asList("77451c3ba8a070343e69157cdaf2be92"));
|
||||
executeTest("test mask", spec);
|
||||
}
|
||||
|
||||
|
|
@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
public void testFilter1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("5effa4b4fdd4dd33a373561637a5d86e"));
|
||||
Arrays.asList("3b44be79d676536bd6c0f32774091fee"));
|
||||
executeTest("test filter #1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -48,7 +48,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
public void testFilter2() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("5e0077148eda3b4274fbef1048902d47"));
|
||||
Arrays.asList("baa425fa8e0761a05733a1a9d62d02ff"));
|
||||
executeTest("test filter #2", spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -47,7 +47,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1PointEM() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("ad7024c3c880a451d2f5537797b49beb"));
|
||||
Arrays.asList("68fcdca40df72b3c703bab846c5f0bbd"));
|
||||
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -55,7 +55,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2PointEM() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("7709af47dde8e127e0e36e86073e2cb1"));
|
||||
Arrays.asList("eb0cd5494ae4c1781bfa00b6c4146993"));
|
||||
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -68,7 +68,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testPooled1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
|
||||
Arrays.asList("1905bc65b1abb56c776558d562de5ea1"));
|
||||
Arrays.asList("ec1aeb69d7d54a7ced1ce625146d1d59"));
|
||||
executeTest("testPooled1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -81,7 +81,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("aaecb9fe822a42576500a91973baff03"));
|
||||
Arrays.asList("e27552dad05ddf17403aaa7176b9cfe2"));
|
||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -89,7 +89,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("d36a8ba5ddf1265ab5be2ed390fa56e1"));
|
||||
Arrays.asList("3618711e41b7e37f47b995d39adbc76b"));
|
||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -97,7 +97,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("029706a60440660c6a636091e9489122"));
|
||||
Arrays.asList("2a32add40319ab2de44951624df2be4b"));
|
||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testVariantsToVCFUsingGeliInput() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("0b96a8046d2a06bd87f57df8bac1678d");
|
||||
md5.add("a94c15f2e8905fd3e98301375cf0f42a");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
|
|
@ -50,7 +50,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testGenotypesToVCFUsingGeliInput() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("09660faa7cfad8af36602f79461c0605");
|
||||
md5.add("6b18f33e25edbd2154c17a949656644b");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import java.util.*;
|
|||
public class VCFHeaderTest extends BaseTest {
|
||||
|
||||
private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
|
||||
private Map<String, String> metaData = new HashMap();
|
||||
private Set<String> metaData = new HashSet();
|
||||
private Set<String> additionalColumns = new HashSet<String>();
|
||||
|
||||
/**
|
||||
|
|
@ -26,8 +26,8 @@ public class VCFHeaderTest extends BaseTest {
|
|||
*/
|
||||
@Test
|
||||
public void testHeaderConstructor() {
|
||||
metaData.put("format","VCRv3.2");
|
||||
metaData.put("two","2");
|
||||
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
|
||||
metaData.add("two=2");
|
||||
additionalColumns.add("extra1");
|
||||
additionalColumns.add("extra2");
|
||||
// this should create a header that is valid
|
||||
|
|
@ -40,12 +40,7 @@ public class VCFHeaderTest extends BaseTest {
|
|||
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index],field);
|
||||
index++;
|
||||
}
|
||||
index = 0;
|
||||
for (String key: header.getMetaData().keySet()) {
|
||||
Assert.assertEquals(header.getMetaData().get(key),metaData.get(key));
|
||||
index++;
|
||||
}
|
||||
Assert.assertEquals(metaData.size(),index);
|
||||
Assert.assertEquals(metaData.size(), header.getMetaData().size());
|
||||
index = 0;
|
||||
for (String key: header.getGenotypeSamples()) {
|
||||
Assert.assertTrue(additionalColumns.contains(key));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,18 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class VCFIntegrationTest extends WalkerTest {
|
||||
|
||||
@Test
|
||||
public void test1() {
|
||||
// Read in and then emit each record
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T PrintRODs -R /broad/1KG/reference/human_b36_both.fasta -L 1:10,000,000-10,050,000 -o %s -B vcf,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/complexExample.vcf", 1,
|
||||
Arrays.asList("26ad7a663d0f247ac26ce5490edd7ec0"));
|
||||
executeTest("test vcf", spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -20,7 +20,7 @@ public class VCFReaderTest extends BaseTest {
|
|||
|
||||
private static final File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
|
||||
private static final File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf");
|
||||
private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.vcf";
|
||||
private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.v2.vcf";
|
||||
|
||||
private static IndexedFastaSequenceFile seq;
|
||||
|
||||
|
|
|
|||
|
|
@ -140,10 +140,10 @@ public class VCFRecordTest extends BaseTest {
|
|||
* @return a fake VCF header
|
||||
*/
|
||||
public static VCFHeader createFakeHeader() {
|
||||
Map<String, String> metaData = new HashMap();
|
||||
Set<String> metaData = new HashSet();
|
||||
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
|
||||
metaData.add("two=2");
|
||||
Set<String> additionalColumns = new HashSet<String>();
|
||||
metaData.put("format", "VCRv3.2"); // required
|
||||
metaData.put("two", "2");
|
||||
additionalColumns.add("FORMAT");
|
||||
additionalColumns.add("sample1");
|
||||
return new VCFHeader(metaData, additionalColumns);
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ import java.util.*;
|
|||
*/
|
||||
public class VCFWriterTest extends BaseTest {
|
||||
private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
|
||||
private Map<String, String> metaData = new HashMap();
|
||||
private Set<String> metaData = new HashSet();
|
||||
private Set<String> additionalColumns = new HashSet<String>();
|
||||
private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf");
|
||||
|
||||
|
|
@ -62,9 +62,9 @@ public class VCFWriterTest extends BaseTest {
|
|||
* create a fake header of known quantity
|
||||
* @return a fake VCF header
|
||||
*/
|
||||
public static VCFHeader createFakeHeader(Map<String, String> metaData, Set<String> additionalColumns) {
|
||||
metaData.put("format", "VCRv3.2"); // required
|
||||
metaData.put("two", "2");
|
||||
public static VCFHeader createFakeHeader(Set<String> metaData, Set<String> additionalColumns) {
|
||||
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
|
||||
metaData.add("two=2");
|
||||
additionalColumns.add("FORMAT");
|
||||
additionalColumns.add("extra1");
|
||||
additionalColumns.add("extra2");
|
||||
|
|
@ -109,12 +109,7 @@ public class VCFWriterTest extends BaseTest {
|
|||
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index], field);
|
||||
index++;
|
||||
}
|
||||
index = 0;
|
||||
for (String key : header.getMetaData().keySet()) {
|
||||
Assert.assertEquals(header.getMetaData().get(key), metaData.get(key));
|
||||
index++;
|
||||
}
|
||||
Assert.assertEquals(metaData.size(), index);
|
||||
Assert.assertEquals(metaData.size(), header.getMetaData().size());
|
||||
index = 0;
|
||||
for (String key : header.getGenotypeSamples()) {
|
||||
Assert.assertTrue(additionalColumns.contains(key));
|
||||
|
|
|
|||
Loading…
Reference in New Issue