From bc6f24e88fc91421f910ec6970f7961f8fb40143 Mon Sep 17 00:00:00 2001 From: ebanks Date: Mon, 9 Nov 2009 04:53:32 +0000 Subject: [PATCH] Added VCFUtils which contains some useful VCF-related functions (e.g. ability to merge VCF records). Also, various minor improvements. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1998 348d0f76-0448-11de-a6fe-93d51630548a --- .../utils/genotype/vcf/VCFGenotypeCall.java | 2 +- .../vcf/VCFGenotypeWriterAdapter.java | 52 +---- .../sting/utils/genotype/vcf/VCFUtils.java | 207 ++++++++++++++++++ .../sting/utils/genotype/vcf/VCFWriter.java | 2 + 4 files changed, 213 insertions(+), 50 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java index e3d837f26..a533dddae 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeCall.java @@ -50,7 +50,7 @@ public class VCFGenotypeCall implements Genotype, ReadBacked, PosteriorsBacked, public VCFGenotypeCall(char ref, GenomeLoc loc, String genotype, double negLog10PError, String sample) { mRefBase = ref; mLocation = loc; - mBestGenotype = DiploidGenotype.valueOf(genotype); + mBestGenotype = DiploidGenotype.unorderedValueOf(genotype); mRefGenotype = DiploidGenotype.createHomGenotype(ref); mSampleName = sample; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index df74f8938..5a2a84bd7 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -59,7 +59,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { Map hInfo = new HashMap(); // setup the header fields - hInfo.put("format", "VCRv3.2"); + hInfo.put("format", VCFWriter.VERSION); hInfo.put("source", mSource); hInfo.put("reference", mReferenceName); @@ -144,7 +144,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { for (String name : mHeader.getGenotypeSamples()) { if (genotypeMap.containsKey(name)) { Genotype gtype = genotypeMap.get(name); - VCFGenotypeRecord record = createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype); + VCFGenotypeRecord record = VCFUtils.createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype); totalReadDepth += ((VCFGenotypeCall)gtype).getReadCount(); params.addGenotypeRecord(record); genotypeMap.remove(name); @@ -190,7 +190,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { * * @return a mapping of info field to value */ - private Map getInfoFields(VCFGenotypeLocusData locusdata, VCFParameters params) { + private static Map getInfoFields(VCFGenotypeLocusData locusdata, VCFParameters params) { Map infoFields = new HashMap(); if ( locusdata != null ) { infoFields.put("SB", String.format("%.2f", locusdata.getSLOD())); @@ -200,37 +200,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { return infoFields; } - /** - * create the VCF genotype record - * - * @param params the VCF parameters object - * @param gtype the genotype - * - * @return a VCFGenotypeRecord - */ - private VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeCall gtype) { - Map map = new HashMap(); - - // calculate the RMS mapping qualities and the read depth - int readDepth = gtype.getReadCount(); - map.put("RD", String.valueOf(readDepth)); - params.addFormatItem("RD"); - double qual = gtype.getNegLog10PError(); - map.put("GQ", String.format("%.2f", qual)); - params.addFormatItem("GQ"); - - List alleles = createAlleleArray(gtype); - for (VCFGenotypeEncoding allele : alleles) { - params.addAlternateBase(allele); - } - - VCFGenotypeRecord record = new VCFGenotypeRecord(gtype.getSampleName(), - alleles, - VCFGenotypeRecord.PHASE.UNPHASED, - map); - return record; - } - /** * create a no call record * @@ -253,21 +222,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { return record; } - /** - * create the allele array? - * - * @param gtype the gentoype object - * - * @return a list of string representing the string array of alleles - */ - private List createAlleleArray(Genotype gtype) { - List alleles = new ArrayList(); - for (char allele : gtype.getBases().toCharArray()) { - alleles.add(new VCFGenotypeEncoding(String.valueOf(allele))); - } - return alleles; - } - /** @return true if we support multisample, false otherwise */ public boolean supportsMultiSample() { return true; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java new file mode 100755 index 000000000..2858b9b30 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java @@ -0,0 +1,207 @@ +package org.broadinstitute.sting.utils.genotype.vcf; + +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.genotype.Genotype; + +import java.util.*; + +/** + * A set of static utility methods for common operations on VCF files/records. + */ +public class VCFUtils { + /** + * Constructor access disallowed...static utility methods only! + */ + private VCFUtils() { } + + + /** + * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap + * (e.g. sampleX.1, sampleX.2, ...) + * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping + * from rod/sample pairs to the new uniquified names + * + * @param toolkit GATK engine + * @param samples set to store the sample names + * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names + */ + public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set samples, Map, String> rodNamesToSampleNames) { + + // keep a map of sample name to next available uniquified index + HashMap sampleOverlapMap = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + ReferenceOrderedData rod = source.getReferenceOrderedData(); + if ( rod.getType().equals(RodVCF.class) ) { + VCFReader reader = new VCFReader(rod.getFile()); + Set vcfSamples = reader.getHeader().getGenotypeSamples(); + for ( String sample : vcfSamples ) + addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName()); + reader.close(); + } + } + } + + private static void addUniqueSample(Set samples, Map sampleOverlapMap, Map, String> rodNamesToSampleNames, String newSample, String rodName) { + + // if it's already a non-unique sample name, give it a unique suffix and increment the value + Integer uniqueIndex = sampleOverlapMap.get(newSample); + if ( uniqueIndex != null ) { + String uniqueName = newSample + "." + uniqueIndex; + samples.add(uniqueName); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName); + sampleOverlapMap.put(newSample, uniqueIndex + 1); + } + + // if this is the second occurrence of the sample name, uniquify both of them + else if ( samples.contains(newSample) ) { + samples.remove(newSample); + String uniqueName1 = newSample + "." + 1; + samples.add(uniqueName1); + for ( java.util.Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { + if ( entry.getValue().equals(newSample) ) { + entry.setValue(uniqueName1); + break; + } + } + + String uniqueName2 = newSample + "." + 2; + samples.add(uniqueName2); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName2); + + sampleOverlapMap.put(newSample, 3); + } + + // otherwise, just add it to the list of samples + else { + samples.add(newSample); + rodNamesToSampleNames.put(new Pair(rodName, newSample), newSample); + } + } + + /** + * Merges various vcf records into a single one using the mapping from rodNamesToSampleNames to get unique sample names + * + * @param rods the vcf rods + * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names + * @return the new merged vcf record + */ + public static VCFRecord mergeRecords(List rods, Map, String> rodNamesToSampleNames) { + + VCFParameters params = new VCFParameters(); + params.addFormatItem("GT"); + + // keep track of the locus specific data so we can merge them intelligently + int totalReadDepth = 0; + double maxConfidence = 0.0; + double totalSLOD = 0.0; + int SLODsSeen = 0; + double totalFreq = 0.0; + int freqsSeen = 0; + + for ( RodVCF rod : rods ) { + List myGenotypes = rod.getGenotypes(); + for ( Genotype g : myGenotypes ) { + if ( !(g instanceof VCFGenotypeCall) ) + throw new StingException("Expected VCFGenotypeCall object but instead saw " + g.getClass().getSimpleName()); + + // set the name to be the new uniquified name and add it to the list of genotypes + VCFGenotypeCall call = (VCFGenotypeCall)g; + call.setSampleName(rodNamesToSampleNames.get(new Pair(rod.getName(), call.getSampleName()))); + if ( params.getPosition() < 1 ) + params.setLocations(call.getLocation(), call.getReference()); + params.addGenotypeRecord(createVCFGenotypeRecord(params, call)); + totalReadDepth += call.getReadCount(); + } + + // set the overall confidence to be the max entry we see + double confidence = 10.0 * rod.getNegLog10PError(); + if ( confidence > maxConfidence ) + maxConfidence = confidence; + + if ( rod.hasNonRefAlleleFrequency() ) { + totalFreq += rod.getNonRefAlleleFrequency(); + freqsSeen++; + } + + if ( rod.hasStrandBias() ) { + totalSLOD += rod.getStrandBias(); + SLODsSeen++; + } + } + + Map infoFields = new HashMap(); + infoFields.put("DP", String.format("%d", totalReadDepth)); + infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size())); + + // set the overall strand bias and allele frequency to be the average of all entries we've seen + if ( SLODsSeen > 0 ) + infoFields.put("SB", String.format("%.2f", (totalSLOD/(double)SLODsSeen))); + if ( freqsSeen > 0 ) + infoFields.put("AF", String.format("%.2f", (totalFreq/(double)freqsSeen))); + + return new VCFRecord(params.getReferenceBase(), + params.getContig(), + params.getPosition(), + ".", + params.getAlternateBases(), + maxConfidence, + "0", + infoFields, + params.getFormatString(), + params.getGenotypesRecords()); + } + + /** + * create the VCF genotype record + * + * @param params the VCF parameters object + * @param gtype the genotype + * + * @return a VCFGenotypeRecord + */ + public static VCFGenotypeRecord createVCFGenotypeRecord(VCFParameters params, VCFGenotypeCall gtype) { + Map map = new HashMap(); + + // calculate the RMS mapping qualities and the read depth + int readDepth = gtype.getReadCount(); + map.put("RD", String.valueOf(readDepth)); + params.addFormatItem("RD"); + double qual = gtype.getNegLog10PError(); + map.put("GQ", String.format("%.2f", qual)); + params.addFormatItem("GQ"); + + List alleles = createAlleleArray(gtype); + for (VCFGenotypeEncoding allele : alleles) { + params.addAlternateBase(allele); + } + + VCFGenotypeRecord record = new VCFGenotypeRecord(gtype.getSampleName(), + alleles, + VCFGenotypeRecord.PHASE.UNPHASED, + map); + return record; + } + + /** + * create the allele array? + * + * @param gtype the gentoype object + * + * @return a list of string representing the string array of alleles + */ + private static List createAlleleArray(Genotype gtype) { + List alleles = new ArrayList(); + for (char allele : gtype.getBases().toCharArray()) { + alleles.add(new VCFGenotypeEncoding(String.valueOf(allele))); + } + return alleles; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index 3a177a272..ddbd2b2d5 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -8,6 +8,8 @@ import java.io.*; */ public class VCFWriter { + public static final String VERSION = "VCRv3.2"; + // the VCF header we're storing private VCFHeader mHeader;