Added VCFUtils which contains some useful VCF-related functions (e.g. ability to merge VCF records).
Also, various minor improvements. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1998 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
cff645e98b
commit
bc6f24e88f
|
|
@ -50,7 +50,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked,
|
||||||
public VCFGenotypeCall(char ref, GenomeLoc loc, String genotype, double negLog10PError, String sample) {
|
public VCFGenotypeCall(char ref, GenomeLoc loc, String genotype, double negLog10PError, String sample) {
|
||||||
mRefBase = ref;
|
mRefBase = ref;
|
||||||
mLocation = loc;
|
mLocation = loc;
|
||||||
mBestGenotype = DiploidGenotype.valueOf(genotype);
|
mBestGenotype = DiploidGenotype.unorderedValueOf(genotype);
|
||||||
mRefGenotype = DiploidGenotype.createHomGenotype(ref);
|
mRefGenotype = DiploidGenotype.createHomGenotype(ref);
|
||||||
mSampleName = sample;
|
mSampleName = sample;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -59,7 +59,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
||||||
Map<String, String> hInfo = new HashMap<String, String>();
|
Map<String, String> hInfo = new HashMap<String, String>();
|
||||||
|
|
||||||
// setup the header fields
|
// setup the header fields
|
||||||
hInfo.put("format", "VCRv3.2");
|
hInfo.put("format", VCFWriter.VERSION);
|
||||||
hInfo.put("source", mSource);
|
hInfo.put("source", mSource);
|
||||||
hInfo.put("reference", mReferenceName);
|
hInfo.put("reference", mReferenceName);
|
||||||
|
|
||||||
|
|
@ -144,7 +144,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
||||||
for (String name : mHeader.getGenotypeSamples()) {
|
for (String name : mHeader.getGenotypeSamples()) {
|
||||||
if (genotypeMap.containsKey(name)) {
|
if (genotypeMap.containsKey(name)) {
|
||||||
Genotype gtype = genotypeMap.get(name);
|
Genotype gtype = genotypeMap.get(name);
|
||||||
VCFGenotypeRecord record = createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype);
|
VCFGenotypeRecord record = VCFUtils.createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype);
|
||||||
totalReadDepth += ((VCFGenotypeCall)gtype).getReadCount();
|
totalReadDepth += ((VCFGenotypeCall)gtype).getReadCount();
|
||||||
params.addGenotypeRecord(record);
|
params.addGenotypeRecord(record);
|
||||||
genotypeMap.remove(name);
|
genotypeMap.remove(name);
|
||||||
|
|
@ -190,7 +190,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
||||||
*
|
*
|
||||||
* @return a mapping of info field to value
|
* @return a mapping of info field to value
|
||||||
*/
|
*/
|
||||||
private Map<String, String> getInfoFields(VCFGenotypeLocusData locusdata, VCFParameters params) {
|
private static Map<String, String> getInfoFields(VCFGenotypeLocusData locusdata, VCFParameters params) {
|
||||||
Map<String, String> infoFields = new HashMap<String, String>();
|
Map<String, String> infoFields = new HashMap<String, String>();
|
||||||
if ( locusdata != null ) {
|
if ( locusdata != null ) {
|
||||||
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
|
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
|
||||||
|
|
@ -200,37 +200,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
||||||
return infoFields;
|
return infoFields;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* create the VCF genotype record
|
|
||||||
*
|
|
||||||
* @param params the VCF parameters object
|
|
||||||
* @param gtype the genotype
|
|
||||||
*
|
|
||||||
* @return a VCFGenotypeRecord
|
|
||||||
*/
|
|
||||||
private VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeCall gtype) {
|
|
||||||
Map<String, String> map = new HashMap<String, String>();
|
|
||||||
|
|
||||||
// calculate the RMS mapping qualities and the read depth
|
|
||||||
int readDepth = gtype.getReadCount();
|
|
||||||
map.put("RD", String.valueOf(readDepth));
|
|
||||||
params.addFormatItem("RD");
|
|
||||||
double qual = gtype.getNegLog10PError();
|
|
||||||
map.put("GQ", String.format("%.2f", qual));
|
|
||||||
params.addFormatItem("GQ");
|
|
||||||
|
|
||||||
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
|
|
||||||
for (VCFGenotypeEncoding allele : alleles) {
|
|
||||||
params.addAlternateBase(allele);
|
|
||||||
}
|
|
||||||
|
|
||||||
VCFGenotypeRecord record = new VCFGenotypeRecord(gtype.getSampleName(),
|
|
||||||
alleles,
|
|
||||||
VCFGenotypeRecord.PHASE.UNPHASED,
|
|
||||||
map);
|
|
||||||
return record;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a no call record
|
* create a no call record
|
||||||
*
|
*
|
||||||
|
|
@ -253,21 +222,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
||||||
return record;
|
return record;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* create the allele array?
|
|
||||||
*
|
|
||||||
* @param gtype the gentoype object
|
|
||||||
*
|
|
||||||
* @return a list of string representing the string array of alleles
|
|
||||||
*/
|
|
||||||
private List<VCFGenotypeEncoding> createAlleleArray(Genotype gtype) {
|
|
||||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
|
||||||
for (char allele : gtype.getBases().toCharArray()) {
|
|
||||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(allele)));
|
|
||||||
}
|
|
||||||
return alleles;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** @return true if we support multisample, false otherwise */
|
/** @return true if we support multisample, false otherwise */
|
||||||
public boolean supportsMultiSample() {
|
public boolean supportsMultiSample() {
|
||||||
return true;
|
return true;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,207 @@
|
||||||
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A set of static utility methods for common operations on VCF files/records.
|
||||||
|
*/
|
||||||
|
public class VCFUtils {
|
||||||
|
/**
|
||||||
|
* Constructor access disallowed...static utility methods only!
|
||||||
|
*/
|
||||||
|
private VCFUtils() { }
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||||
|
* (e.g. sampleX.1, sampleX.2, ...)
|
||||||
|
* When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping
|
||||||
|
* from rod/sample pairs to the new uniquified names
|
||||||
|
*
|
||||||
|
* @param toolkit GATK engine
|
||||||
|
* @param samples set to store the sample names
|
||||||
|
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
|
||||||
|
*/
|
||||||
|
public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> samples, Map<Pair<String, String>, String> rodNamesToSampleNames) {
|
||||||
|
|
||||||
|
// keep a map of sample name to next available uniquified index
|
||||||
|
HashMap<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();
|
||||||
|
|
||||||
|
// iterate to get all of the sample names
|
||||||
|
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||||
|
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||||
|
ReferenceOrderedData rod = source.getReferenceOrderedData();
|
||||||
|
if ( rod.getType().equals(RodVCF.class) ) {
|
||||||
|
VCFReader reader = new VCFReader(rod.getFile());
|
||||||
|
Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
||||||
|
for ( String sample : vcfSamples )
|
||||||
|
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName());
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addUniqueSample(Set<String> samples, Map<String, Integer> sampleOverlapMap, Map<Pair<String, String>, String> rodNamesToSampleNames, String newSample, String rodName) {
|
||||||
|
|
||||||
|
// if it's already a non-unique sample name, give it a unique suffix and increment the value
|
||||||
|
Integer uniqueIndex = sampleOverlapMap.get(newSample);
|
||||||
|
if ( uniqueIndex != null ) {
|
||||||
|
String uniqueName = newSample + "." + uniqueIndex;
|
||||||
|
samples.add(uniqueName);
|
||||||
|
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName);
|
||||||
|
sampleOverlapMap.put(newSample, uniqueIndex + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// if this is the second occurrence of the sample name, uniquify both of them
|
||||||
|
else if ( samples.contains(newSample) ) {
|
||||||
|
samples.remove(newSample);
|
||||||
|
String uniqueName1 = newSample + "." + 1;
|
||||||
|
samples.add(uniqueName1);
|
||||||
|
for ( java.util.Map.Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
|
||||||
|
if ( entry.getValue().equals(newSample) ) {
|
||||||
|
entry.setValue(uniqueName1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String uniqueName2 = newSample + "." + 2;
|
||||||
|
samples.add(uniqueName2);
|
||||||
|
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName2);
|
||||||
|
|
||||||
|
sampleOverlapMap.put(newSample, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
// otherwise, just add it to the list of samples
|
||||||
|
else {
|
||||||
|
samples.add(newSample);
|
||||||
|
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), newSample);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Merges various vcf records into a single one using the mapping from rodNamesToSampleNames to get unique sample names
|
||||||
|
*
|
||||||
|
* @param rods the vcf rods
|
||||||
|
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
|
||||||
|
* @return the new merged vcf record
|
||||||
|
*/
|
||||||
|
public static VCFRecord mergeRecords(List<RodVCF> rods, Map<Pair<String, String>, String> rodNamesToSampleNames) {
|
||||||
|
|
||||||
|
VCFParameters params = new VCFParameters();
|
||||||
|
params.addFormatItem("GT");
|
||||||
|
|
||||||
|
// keep track of the locus specific data so we can merge them intelligently
|
||||||
|
int totalReadDepth = 0;
|
||||||
|
double maxConfidence = 0.0;
|
||||||
|
double totalSLOD = 0.0;
|
||||||
|
int SLODsSeen = 0;
|
||||||
|
double totalFreq = 0.0;
|
||||||
|
int freqsSeen = 0;
|
||||||
|
|
||||||
|
for ( RodVCF rod : rods ) {
|
||||||
|
List<Genotype> myGenotypes = rod.getGenotypes();
|
||||||
|
for ( Genotype g : myGenotypes ) {
|
||||||
|
if ( !(g instanceof VCFGenotypeCall) )
|
||||||
|
throw new StingException("Expected VCFGenotypeCall object but instead saw " + g.getClass().getSimpleName());
|
||||||
|
|
||||||
|
// set the name to be the new uniquified name and add it to the list of genotypes
|
||||||
|
VCFGenotypeCall call = (VCFGenotypeCall)g;
|
||||||
|
call.setSampleName(rodNamesToSampleNames.get(new Pair<String, String>(rod.getName(), call.getSampleName())));
|
||||||
|
if ( params.getPosition() < 1 )
|
||||||
|
params.setLocations(call.getLocation(), call.getReference());
|
||||||
|
params.addGenotypeRecord(createVCFGenotypeRecord(params, call));
|
||||||
|
totalReadDepth += call.getReadCount();
|
||||||
|
}
|
||||||
|
|
||||||
|
// set the overall confidence to be the max entry we see
|
||||||
|
double confidence = 10.0 * rod.getNegLog10PError();
|
||||||
|
if ( confidence > maxConfidence )
|
||||||
|
maxConfidence = confidence;
|
||||||
|
|
||||||
|
if ( rod.hasNonRefAlleleFrequency() ) {
|
||||||
|
totalFreq += rod.getNonRefAlleleFrequency();
|
||||||
|
freqsSeen++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( rod.hasStrandBias() ) {
|
||||||
|
totalSLOD += rod.getStrandBias();
|
||||||
|
SLODsSeen++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<String, String> infoFields = new HashMap<String, String>();
|
||||||
|
infoFields.put("DP", String.format("%d", totalReadDepth));
|
||||||
|
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
|
||||||
|
|
||||||
|
// set the overall strand bias and allele frequency to be the average of all entries we've seen
|
||||||
|
if ( SLODsSeen > 0 )
|
||||||
|
infoFields.put("SB", String.format("%.2f", (totalSLOD/(double)SLODsSeen)));
|
||||||
|
if ( freqsSeen > 0 )
|
||||||
|
infoFields.put("AF", String.format("%.2f", (totalFreq/(double)freqsSeen)));
|
||||||
|
|
||||||
|
return new VCFRecord(params.getReferenceBase(),
|
||||||
|
params.getContig(),
|
||||||
|
params.getPosition(),
|
||||||
|
".",
|
||||||
|
params.getAlternateBases(),
|
||||||
|
maxConfidence,
|
||||||
|
"0",
|
||||||
|
infoFields,
|
||||||
|
params.getFormatString(),
|
||||||
|
params.getGenotypesRecords());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the VCF genotype record
|
||||||
|
*
|
||||||
|
* @param params the VCF parameters object
|
||||||
|
* @param gtype the genotype
|
||||||
|
*
|
||||||
|
* @return a VCFGenotypeRecord
|
||||||
|
*/
|
||||||
|
public static VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeCall gtype) {
|
||||||
|
Map<String, String> map = new HashMap<String, String>();
|
||||||
|
|
||||||
|
// calculate the RMS mapping qualities and the read depth
|
||||||
|
int readDepth = gtype.getReadCount();
|
||||||
|
map.put("RD", String.valueOf(readDepth));
|
||||||
|
params.addFormatItem("RD");
|
||||||
|
double qual = gtype.getNegLog10PError();
|
||||||
|
map.put("GQ", String.format("%.2f", qual));
|
||||||
|
params.addFormatItem("GQ");
|
||||||
|
|
||||||
|
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
|
||||||
|
for (VCFGenotypeEncoding allele : alleles) {
|
||||||
|
params.addAlternateBase(allele);
|
||||||
|
}
|
||||||
|
|
||||||
|
VCFGenotypeRecord record = new VCFGenotypeRecord(gtype.getSampleName(),
|
||||||
|
alleles,
|
||||||
|
VCFGenotypeRecord.PHASE.UNPHASED,
|
||||||
|
map);
|
||||||
|
return record;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the allele array?
|
||||||
|
*
|
||||||
|
* @param gtype the gentoype object
|
||||||
|
*
|
||||||
|
* @return a list of string representing the string array of alleles
|
||||||
|
*/
|
||||||
|
private static List<VCFGenotypeEncoding> createAlleleArray(Genotype gtype) {
|
||||||
|
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
||||||
|
for (char allele : gtype.getBases().toCharArray()) {
|
||||||
|
alleles.add(new VCFGenotypeEncoding(String.valueOf(allele)));
|
||||||
|
}
|
||||||
|
return alleles;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -8,6 +8,8 @@ import java.io.*;
|
||||||
*/
|
*/
|
||||||
public class VCFWriter {
|
public class VCFWriter {
|
||||||
|
|
||||||
|
public static final String VERSION = "VCRv3.2";
|
||||||
|
|
||||||
// the VCF header we're storing
|
// the VCF header we're storing
|
||||||
private VCFHeader mHeader;
|
private VCFHeader mHeader;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue