Added VCFUtils which contains some useful VCF-related functions (e.g. ability to merge VCF records).
Also, various minor improvements. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1998 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
cff645e98b
commit
bc6f24e88f
|
|
@ -50,7 +50,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked,
|
|||
public VCFGenotypeCall(char ref, GenomeLoc loc, String genotype, double negLog10PError, String sample) {
|
||||
mRefBase = ref;
|
||||
mLocation = loc;
|
||||
mBestGenotype = DiploidGenotype.valueOf(genotype);
|
||||
mBestGenotype = DiploidGenotype.unorderedValueOf(genotype);
|
||||
mRefGenotype = DiploidGenotype.createHomGenotype(ref);
|
||||
mSampleName = sample;
|
||||
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
Map<String, String> hInfo = new HashMap<String, String>();
|
||||
|
||||
// setup the header fields
|
||||
hInfo.put("format", "VCRv3.2");
|
||||
hInfo.put("format", VCFWriter.VERSION);
|
||||
hInfo.put("source", mSource);
|
||||
hInfo.put("reference", mReferenceName);
|
||||
|
||||
|
|
@ -144,7 +144,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
for (String name : mHeader.getGenotypeSamples()) {
|
||||
if (genotypeMap.containsKey(name)) {
|
||||
Genotype gtype = genotypeMap.get(name);
|
||||
VCFGenotypeRecord record = createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype);
|
||||
VCFGenotypeRecord record = VCFUtils.createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype);
|
||||
totalReadDepth += ((VCFGenotypeCall)gtype).getReadCount();
|
||||
params.addGenotypeRecord(record);
|
||||
genotypeMap.remove(name);
|
||||
|
|
@ -190,7 +190,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
*
|
||||
* @return a mapping of info field to value
|
||||
*/
|
||||
private Map<String, String> getInfoFields(VCFGenotypeLocusData locusdata, VCFParameters params) {
|
||||
private static Map<String, String> getInfoFields(VCFGenotypeLocusData locusdata, VCFParameters params) {
|
||||
Map<String, String> infoFields = new HashMap<String, String>();
|
||||
if ( locusdata != null ) {
|
||||
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
|
||||
|
|
@ -200,37 +200,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
return infoFields;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the VCF genotype record
|
||||
*
|
||||
* @param params the VCF parameters object
|
||||
* @param gtype the genotype
|
||||
*
|
||||
* @return a VCFGenotypeRecord
|
||||
*/
|
||||
private VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeCall gtype) {
|
||||
Map<String, String> map = new HashMap<String, String>();
|
||||
|
||||
// calculate the RMS mapping qualities and the read depth
|
||||
int readDepth = gtype.getReadCount();
|
||||
map.put("RD", String.valueOf(readDepth));
|
||||
params.addFormatItem("RD");
|
||||
double qual = gtype.getNegLog10PError();
|
||||
map.put("GQ", String.format("%.2f", qual));
|
||||
params.addFormatItem("GQ");
|
||||
|
||||
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
|
||||
for (VCFGenotypeEncoding allele : alleles) {
|
||||
params.addAlternateBase(allele);
|
||||
}
|
||||
|
||||
VCFGenotypeRecord record = new VCFGenotypeRecord(gtype.getSampleName(),
|
||||
alleles,
|
||||
VCFGenotypeRecord.PHASE.UNPHASED,
|
||||
map);
|
||||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a no call record
|
||||
*
|
||||
|
|
@ -253,21 +222,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the allele array?
|
||||
*
|
||||
* @param gtype the gentoype object
|
||||
*
|
||||
* @return a list of string representing the string array of alleles
|
||||
*/
|
||||
private List<VCFGenotypeEncoding> createAlleleArray(Genotype gtype) {
|
||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
for (char allele : gtype.getBases().toCharArray()) {
|
||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(allele)));
|
||||
}
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/** @return true if we support multisample, false otherwise */
|
||||
public boolean supportsMultiSample() {
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,207 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A set of static utility methods for common operations on VCF files/records.
|
||||
*/
|
||||
public class VCFUtils {
|
||||
/**
|
||||
* Constructor access disallowed...static utility methods only!
|
||||
*/
|
||||
private VCFUtils() { }
|
||||
|
||||
|
||||
/**
|
||||
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||
* (e.g. sampleX.1, sampleX.2, ...)
|
||||
* When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping
|
||||
* from rod/sample pairs to the new uniquified names
|
||||
*
|
||||
* @param toolkit GATK engine
|
||||
* @param samples set to store the sample names
|
||||
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
|
||||
*/
|
||||
public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> samples, Map<Pair<String, String>, String> rodNamesToSampleNames) {
|
||||
|
||||
// keep a map of sample name to next available uniquified index
|
||||
HashMap<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
ReferenceOrderedData rod = source.getReferenceOrderedData();
|
||||
if ( rod.getType().equals(RodVCF.class) ) {
|
||||
VCFReader reader = new VCFReader(rod.getFile());
|
||||
Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
||||
for ( String sample : vcfSamples )
|
||||
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName());
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void addUniqueSample(Set<String> samples, Map<String, Integer> sampleOverlapMap, Map<Pair<String, String>, String> rodNamesToSampleNames, String newSample, String rodName) {
|
||||
|
||||
// if it's already a non-unique sample name, give it a unique suffix and increment the value
|
||||
Integer uniqueIndex = sampleOverlapMap.get(newSample);
|
||||
if ( uniqueIndex != null ) {
|
||||
String uniqueName = newSample + "." + uniqueIndex;
|
||||
samples.add(uniqueName);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName);
|
||||
sampleOverlapMap.put(newSample, uniqueIndex + 1);
|
||||
}
|
||||
|
||||
// if this is the second occurrence of the sample name, uniquify both of them
|
||||
else if ( samples.contains(newSample) ) {
|
||||
samples.remove(newSample);
|
||||
String uniqueName1 = newSample + "." + 1;
|
||||
samples.add(uniqueName1);
|
||||
for ( java.util.Map.Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
|
||||
if ( entry.getValue().equals(newSample) ) {
|
||||
entry.setValue(uniqueName1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
String uniqueName2 = newSample + "." + 2;
|
||||
samples.add(uniqueName2);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName2);
|
||||
|
||||
sampleOverlapMap.put(newSample, 3);
|
||||
}
|
||||
|
||||
// otherwise, just add it to the list of samples
|
||||
else {
|
||||
samples.add(newSample);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), newSample);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges various vcf records into a single one using the mapping from rodNamesToSampleNames to get unique sample names
|
||||
*
|
||||
* @param rods the vcf rods
|
||||
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
|
||||
* @return the new merged vcf record
|
||||
*/
|
||||
public static VCFRecord mergeRecords(List<RodVCF> rods, Map<Pair<String, String>, String> rodNamesToSampleNames) {
|
||||
|
||||
VCFParameters params = new VCFParameters();
|
||||
params.addFormatItem("GT");
|
||||
|
||||
// keep track of the locus specific data so we can merge them intelligently
|
||||
int totalReadDepth = 0;
|
||||
double maxConfidence = 0.0;
|
||||
double totalSLOD = 0.0;
|
||||
int SLODsSeen = 0;
|
||||
double totalFreq = 0.0;
|
||||
int freqsSeen = 0;
|
||||
|
||||
for ( RodVCF rod : rods ) {
|
||||
List<Genotype> myGenotypes = rod.getGenotypes();
|
||||
for ( Genotype g : myGenotypes ) {
|
||||
if ( !(g instanceof VCFGenotypeCall) )
|
||||
throw new StingException("Expected VCFGenotypeCall object but instead saw " + g.getClass().getSimpleName());
|
||||
|
||||
// set the name to be the new uniquified name and add it to the list of genotypes
|
||||
VCFGenotypeCall call = (VCFGenotypeCall)g;
|
||||
call.setSampleName(rodNamesToSampleNames.get(new Pair<String, String>(rod.getName(), call.getSampleName())));
|
||||
if ( params.getPosition() < 1 )
|
||||
params.setLocations(call.getLocation(), call.getReference());
|
||||
params.addGenotypeRecord(createVCFGenotypeRecord(params, call));
|
||||
totalReadDepth += call.getReadCount();
|
||||
}
|
||||
|
||||
// set the overall confidence to be the max entry we see
|
||||
double confidence = 10.0 * rod.getNegLog10PError();
|
||||
if ( confidence > maxConfidence )
|
||||
maxConfidence = confidence;
|
||||
|
||||
if ( rod.hasNonRefAlleleFrequency() ) {
|
||||
totalFreq += rod.getNonRefAlleleFrequency();
|
||||
freqsSeen++;
|
||||
}
|
||||
|
||||
if ( rod.hasStrandBias() ) {
|
||||
totalSLOD += rod.getStrandBias();
|
||||
SLODsSeen++;
|
||||
}
|
||||
}
|
||||
|
||||
Map<String, String> infoFields = new HashMap<String, String>();
|
||||
infoFields.put("DP", String.format("%d", totalReadDepth));
|
||||
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
|
||||
|
||||
// set the overall strand bias and allele frequency to be the average of all entries we've seen
|
||||
if ( SLODsSeen > 0 )
|
||||
infoFields.put("SB", String.format("%.2f", (totalSLOD/(double)SLODsSeen)));
|
||||
if ( freqsSeen > 0 )
|
||||
infoFields.put("AF", String.format("%.2f", (totalFreq/(double)freqsSeen)));
|
||||
|
||||
return new VCFRecord(params.getReferenceBase(),
|
||||
params.getContig(),
|
||||
params.getPosition(),
|
||||
".",
|
||||
params.getAlternateBases(),
|
||||
maxConfidence,
|
||||
"0",
|
||||
infoFields,
|
||||
params.getFormatString(),
|
||||
params.getGenotypesRecords());
|
||||
}
|
||||
|
||||
/**
|
||||
* create the VCF genotype record
|
||||
*
|
||||
* @param params the VCF parameters object
|
||||
* @param gtype the genotype
|
||||
*
|
||||
* @return a VCFGenotypeRecord
|
||||
*/
|
||||
public static VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeCall gtype) {
|
||||
Map<String, String> map = new HashMap<String, String>();
|
||||
|
||||
// calculate the RMS mapping qualities and the read depth
|
||||
int readDepth = gtype.getReadCount();
|
||||
map.put("RD", String.valueOf(readDepth));
|
||||
params.addFormatItem("RD");
|
||||
double qual = gtype.getNegLog10PError();
|
||||
map.put("GQ", String.format("%.2f", qual));
|
||||
params.addFormatItem("GQ");
|
||||
|
||||
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
|
||||
for (VCFGenotypeEncoding allele : alleles) {
|
||||
params.addAlternateBase(allele);
|
||||
}
|
||||
|
||||
VCFGenotypeRecord record = new VCFGenotypeRecord(gtype.getSampleName(),
|
||||
alleles,
|
||||
VCFGenotypeRecord.PHASE.UNPHASED,
|
||||
map);
|
||||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the allele array?
|
||||
*
|
||||
* @param gtype the gentoype object
|
||||
*
|
||||
* @return a list of string representing the string array of alleles
|
||||
*/
|
||||
private static List<VCFGenotypeEncoding> createAlleleArray(Genotype gtype) {
|
||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
for (char allele : gtype.getBases().toCharArray()) {
|
||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(allele)));
|
||||
}
|
||||
return alleles;
|
||||
}
|
||||
}
|
||||
|
|
@ -8,6 +8,8 @@ import java.io.*;
|
|||
*/
|
||||
public class VCFWriter {
|
||||
|
||||
public static final String VERSION = "VCRv3.2";
|
||||
|
||||
// the VCF header we're storing
|
||||
private VCFHeader mHeader;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue