Added:
SequenomToVCF - Takes a sequenom ped file and converts it to a VCF file with the proper metrics for QC. It's currently a rough draft, but is working as expected on a test ped file, which is included as an integration test. Modified: VCFGenotypeCall -- added a cloneCall() method that returns a clone of the call Hapmap2VCF -- removed a VCFGenotypeCall object that gets instantiated and modified but never used (caused me all kinds of confusion when I was basing SequenomToVCF off of it) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2554 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
62dd2fa5be
commit
dfa3c3b875
|
|
@ -66,7 +66,6 @@ public class HapMap2VCF extends RodWalker<Integer, Integer> {
|
|||
|
||||
// Get reference base
|
||||
Character ref_allele = ref.getBase();
|
||||
VCFVariationCall variation = new VCFVariationCall(ref_allele, loc, Variation.VARIANT_TYPE.SNP);
|
||||
|
||||
// Print each sample's genotype info
|
||||
List<Genotype> genotype_calls = new ArrayList<Genotype>();
|
||||
|
|
@ -85,7 +84,6 @@ public class HapMap2VCF extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
// add each genotype to VCF record and write it
|
||||
variation.setGenotypeCalls(genotype_calls);
|
||||
VCFVariationCall call = new VCFVariationCall(ref_allele, loc, Variation.VARIANT_TYPE.SNP);
|
||||
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
||||
if (dbsnp != null)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,342 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.HardyWeinberg;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
|
||||
*/
|
||||
public class SequenomToVCF extends RefWalker<VCFVariationCall,Integer> {
|
||||
@Argument(fullName="sequenomePedFile", shortName="sPed", doc="The sequenome file from which to generate a VCF", required=true)
|
||||
public File seqFile = null;
|
||||
@Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write to", required=true)
|
||||
public File vcfFile = null;
|
||||
@Argument(fullName="numberOfSamples", shortName="ns", doc="The number of samples sequenced", required=false)
|
||||
public int nSamples = 192; // The number of samples in the ped file I wrote this tool for
|
||||
@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
|
||||
"used for smart Hardy-Weinberg annotation",required = false)
|
||||
public File popFile = null;
|
||||
@Argument(fullName="useb36ContigNames",shortName="b36contig",doc="Uses b36 contig names (1:1,000,000) rather than hg18 (chr1:1,000,000) for comparison with ref", required=false)
|
||||
public boolean useb36contigs=false;
|
||||
|
||||
private final int INIT_NUMBER_OF_POPULATIONS = 10;
|
||||
private final int DEFAULT_QUALITY = 20;
|
||||
private HashMap<String, SequenomVariantInfo> sequenomResults = new HashMap<String,SequenomVariantInfo>();
|
||||
private ArrayList<String> sampleNames = new ArrayList<String>(nSamples);
|
||||
private VCFGenotypeWriterAdapter vcfWriter;
|
||||
private final HardyWeinberg HWCalc = new HardyWeinberg();
|
||||
private final boolean useSmartHardy = popFile != null;
|
||||
private HashMap<String,String> samplesToPopulation;
|
||||
|
||||
public void initialize() {
|
||||
//System.out.println("Initializing... parse sequenom file");
|
||||
parseSequenomFile(seqFile);
|
||||
vcfWriter = new VCFGenotypeWriterAdapter(vcfFile);
|
||||
if ( useSmartHardy ) {
|
||||
samplesToPopulation = parsePopulationFile(popFile);
|
||||
}
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("source", "Sequenom2VCF"));
|
||||
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
|
||||
vcfWriter.writeHeader(new TreeSet<String>(sampleNames),hInfo);
|
||||
nSamples = sampleNames.size();
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
int numberOfVariantsProcessed = 0;
|
||||
return numberOfVariantsProcessed;
|
||||
}
|
||||
|
||||
public VCFVariationCall map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( sequenomResults.containsKey(context.getLocation().toString()) ) {
|
||||
SequenomVariantInfo varInfo = sequenomResults.remove(context.getLocation().toString());
|
||||
return addVariantInformationToCall(ref,varInfo);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduce(VCFVariationCall call, Integer numVariants) {
|
||||
if ( call == null ) {
|
||||
return numVariants;
|
||||
} else {
|
||||
printToVCF(call);
|
||||
return 1 + numVariants;
|
||||
}
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer finalReduce) {
|
||||
logger.info("Variants processed="+finalReduce.toString());
|
||||
vcfWriter.close();
|
||||
}
|
||||
|
||||
private void printToVCF(VCFVariationCall call) {
|
||||
vcfWriter.addMultiSampleCall(call.getGenotypes(),call);
|
||||
}
|
||||
|
||||
private VCFVariationCall addVariantInformationToCall(ReferenceContext ref, SequenomVariantInfo varInfo) {
|
||||
int numNoCalls = 0;
|
||||
int numHomNonrefCalls = 0;
|
||||
int numNonrefAlleles = 0;
|
||||
|
||||
int sampleNumber = 0;
|
||||
//System.out.println("Genotypes from varinfo:");
|
||||
//for ( String g : varInfo.getGenotypes()) {
|
||||
//System.out.println(g);
|
||||
//}
|
||||
ArrayList<VCFGenotypeCall> vcfGenotypeCalls = new ArrayList<VCFGenotypeCall>(nSamples);
|
||||
ArrayList<Genotype> genotypeCalls = new ArrayList<Genotype>(nSamples);
|
||||
VCFGenotypeCall vcfCall = new VCFGenotypeCall(ref.getBase(),ref.getLocus());
|
||||
boolean isSNP = false;
|
||||
|
||||
for ( String genTypeStr : varInfo.getGenotypes() ) {
|
||||
if ( genTypeStr.indexOf("0") == -1 ) {
|
||||
vcfCall.setGenotype(DiploidGenotype.createDiploidGenotype(genTypeStr.charAt(0),genTypeStr.charAt(2)));
|
||||
vcfCall.setNegLog10PError((double) DEFAULT_QUALITY/10);
|
||||
vcfCall.setSampleName(sampleNames.get(sampleNumber));
|
||||
genotypeCalls.add( vcfCall.cloneCall() );
|
||||
vcfGenotypeCalls.add( vcfCall.cloneCall() );
|
||||
if ( vcfCall.isVariant(ref.getBase()) ) {
|
||||
isSNP = true;
|
||||
if ( vcfCall.isHom() ) {
|
||||
numHomNonrefCalls++;
|
||||
numNonrefAlleles+=2;
|
||||
} else {
|
||||
numNonrefAlleles++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
numNoCalls++;
|
||||
}
|
||||
sampleNumber++;
|
||||
}
|
||||
|
||||
VCFVariationCall variantCall = new VCFVariationCall(ref.getBase(),ref.getLocus(), isSNP ? VCFVariationCall.VARIANT_TYPE.SNP : VCFVariationCall.VARIANT_TYPE.REFERENCE);
|
||||
variantCall.setGenotypeCalls(genotypeCalls);
|
||||
variantCall.setConfidence((double) DEFAULT_QUALITY);
|
||||
variantCall.setFields(generateInfoField(numNoCalls,numHomNonrefCalls,numNonrefAlleles,variantCall,ref, varInfo, vcfGenotypeCalls));
|
||||
|
||||
return variantCall;
|
||||
}
|
||||
|
||||
private Map<String,String> generateInfoField(int nocall, int homnonref, int allnonref, VCFVariationCall call,
|
||||
ReferenceContext ref, SequenomVariantInfo info, List<VCFGenotypeCall> vcfCalls) {
|
||||
double propNoCall = ( ( double ) nocall / (double) nSamples );
|
||||
double propHomNR = ( (double) homnonref / (double) nSamples );
|
||||
String hardy;
|
||||
if ( useSmartHardy ) {
|
||||
hardy = smartHardy(ref, call, info, vcfCalls);
|
||||
} else {
|
||||
hardy = HWCalc.annotate(null,ref, null, call);
|
||||
}
|
||||
HashMap<String,String> infoMap = new HashMap<String,String>(1);
|
||||
putInfoStrings(infoMap,propNoCall,propHomNR,allnonref,hardy,info.getName());
|
||||
|
||||
return infoMap;
|
||||
}
|
||||
|
||||
private void putInfoStrings(HashMap<String,String> infoMap, double pnc, double phnr, int nra, String hw, String nm) {
|
||||
|
||||
infoMap.put("snpID",nm);
|
||||
infoMap.put("noCallPct",String.format("%.2f",100.0*pnc));
|
||||
infoMap.put("homNonrefPct",String.format("%.2f",100.0*phnr));
|
||||
infoMap.put("nonrefAlleles",String.format("%d",nra));
|
||||
infoMap.put("HW",hw);
|
||||
|
||||
//return String.format("snpID=%s;nocall=%f;homNonref=%4f;numNonrefAlleles=%d;HW=%s",nm,pnc,phnr,nra,hw);
|
||||
|
||||
}
|
||||
|
||||
private String smartHardy(ReferenceContext ref, VCFVariationCall call, SequenomVariantInfo info, List<VCFGenotypeCall> vcfCalls) {
|
||||
HashMap<String,ArrayList<Genotype>> genotypesByPopulation = new HashMap<String,ArrayList<Genotype>>(INIT_NUMBER_OF_POPULATIONS);
|
||||
HashMap<String,String> hardyWeinbergByPopulation = new HashMap<String,String>(INIT_NUMBER_OF_POPULATIONS);
|
||||
|
||||
for ( String population : samplesToPopulation.values() ) {
|
||||
genotypesByPopulation.put(population,new ArrayList<Genotype>());
|
||||
}
|
||||
|
||||
for ( VCFGenotypeCall vgc : vcfCalls ) {
|
||||
genotypesByPopulation.get(vgc.getSampleName()).add(vgc);
|
||||
}
|
||||
|
||||
for ( String population : samplesToPopulation.values() ) {
|
||||
VCFVariationCall v = new VCFVariationCall(ref.getBase(),ref.getLocus(),VCFVariationCall.VARIANT_TYPE.SNP);
|
||||
v.setGenotypeCalls(genotypesByPopulation.get(population));
|
||||
hardyWeinbergByPopulation.put(population,HWCalc.annotate(null,ref,null,v));
|
||||
}
|
||||
|
||||
return smartHardyString(hardyWeinbergByPopulation,info);
|
||||
}
|
||||
|
||||
private String smartHardyString(HashMap<String,String> hwByPop, SequenomVariantInfo varInfo) {
|
||||
// for now just return the maximum:
|
||||
int maxH = -100;
|
||||
for ( String pop : samplesToPopulation.values() ) {
|
||||
maxH = Integer.parseInt(hwByPop.get(pop)) > maxH ? Integer.parseInt(hwByPop.get(pop)) : maxH;
|
||||
}
|
||||
|
||||
return String.format("%s",maxH);
|
||||
}
|
||||
|
||||
private void parseSequenomFile(File sequenomFile) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(sequenomFile));
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException("Sequenom file could not be opened",e);
|
||||
}
|
||||
String header;
|
||||
try {
|
||||
header = reader.readLine();
|
||||
//System.out.println("Read header line, it was "+header);
|
||||
} catch (IOException e) {
|
||||
throw new StingException(e.getMessage(),e);
|
||||
}
|
||||
HashMap<Integer,SequenomVariantInfo> sequenomVariants = parseSequenomHeader(header);
|
||||
try {
|
||||
String line;
|
||||
do {
|
||||
line = reader.readLine();
|
||||
//ystem.out.println("Read line, it was"+line);
|
||||
if ( line != null ) {
|
||||
//System.out.println("Parsing line...");
|
||||
parseSequenomLine(sequenomVariants,line);
|
||||
}
|
||||
} while ( line != null);
|
||||
|
||||
reader.close();
|
||||
convertToLocusMap(sequenomVariants);
|
||||
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("Error reading sequenom file", e);
|
||||
}
|
||||
}
|
||||
|
||||
private HashMap<Integer,SequenomVariantInfo> parseSequenomHeader(String header) {
|
||||
// the header will give us all the probe names and contigs
|
||||
//System.out.println("Parsing header");
|
||||
HashMap<Integer,SequenomVariantInfo> variantsByFileOffset = new HashMap<Integer,SequenomVariantInfo>();
|
||||
String[] fields = header.split("\t");
|
||||
int fieldOffset = 0;
|
||||
for ( String entry : fields ) {
|
||||
if ( fieldOffset > 5 ) {
|
||||
// actually a SNP
|
||||
String snpName = entry.split("\\|")[1];
|
||||
//System.out.println("Entry: "+entry+" Name: "+snpName);
|
||||
variantsByFileOffset.put(fieldOffset,new SequenomVariantInfo(snpName,nSamples));
|
||||
//System.out.println("Adding entry for offset "+fieldOffset);
|
||||
}
|
||||
fieldOffset++;
|
||||
}
|
||||
|
||||
return variantsByFileOffset;
|
||||
}
|
||||
|
||||
private void parseSequenomLine(HashMap<Integer,SequenomVariantInfo> variants, String line) {
|
||||
if ( line == null ) {
|
||||
// early return
|
||||
//System.out.println("Line was null in file");
|
||||
return;
|
||||
}
|
||||
String[] entries = line.split("\t");
|
||||
int offset = 0;
|
||||
if ( entries[1].equalsIgnoreCase("empty")) {
|
||||
return;
|
||||
}
|
||||
sampleNames.add(entries[1]);
|
||||
for ( String entry : entries ) {
|
||||
if ( offset > 5 ) { // actual SNP
|
||||
variants.get(offset).addGenotype(entry);
|
||||
//System.out.println("Added: "+entry+"To "+offset);
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
|
||||
private void convertToLocusMap(HashMap<Integer,SequenomVariantInfo> variants) {
|
||||
for ( SequenomVariantInfo variant : variants.values() ) {
|
||||
//System.out.println("Variant name: "+variant.getName());
|
||||
String loc = genomeLocFromVariantName(variant.getName());
|
||||
//System.out.println(variant.getName());
|
||||
if ( loc == null ) {
|
||||
throw new StingException("Genome locus was null");
|
||||
}
|
||||
sequenomResults.put(loc,variant);
|
||||
}
|
||||
|
||||
variants.clear();
|
||||
variants = null;
|
||||
}
|
||||
|
||||
private String genomeLocFromVariantName(String name) {
|
||||
String[] nameInfo = name.split("_");
|
||||
//System.out.println(name);
|
||||
//System.out.println(nameInfo[0]);
|
||||
String chr = nameInfo[0].substring(1); // format: c3 becomes 3
|
||||
String pos = nameInfo[1].substring(1); // format: p9961104 becomes 9961104
|
||||
if ( useb36contigs ) {
|
||||
return chr+":"+pos;
|
||||
} else {
|
||||
return "chr"+chr+":"+pos;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private HashMap<String,String> parsePopulationFile(File file) {
|
||||
HashMap<String,String> samplesToPopulation = new HashMap<String,String>();
|
||||
try {
|
||||
BufferedReader in = new BufferedReader( new FileReader( file ));
|
||||
String line = in.readLine();
|
||||
while ( line != null ) {
|
||||
String[] populationSamples = line.split("\t");
|
||||
String population = populationSamples[0];
|
||||
for ( int i = 1; i < populationSamples.length; i ++ ) {
|
||||
samplesToPopulation.put(populationSamples[i],population);
|
||||
}
|
||||
}
|
||||
} catch ( IOException e) {
|
||||
throw new StingException("Error reading population file", e);
|
||||
}
|
||||
|
||||
return samplesToPopulation;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
class SequenomVariantInfo {
|
||||
private ArrayList<String> genotypes;
|
||||
private String variantName;
|
||||
|
||||
public SequenomVariantInfo(String name, int nPeople) {
|
||||
genotypes = new ArrayList<String>(nPeople);
|
||||
variantName = name;
|
||||
}
|
||||
|
||||
public void addGenotype(String genotype) {
|
||||
genotypes.add(genotype);
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return variantName;
|
||||
}
|
||||
|
||||
public ArrayList<String> getGenotypes() {
|
||||
return genotypes;
|
||||
}
|
||||
}
|
||||
|
|
@ -223,4 +223,12 @@ public class VCFGenotypeCall extends AlleleConstrainedGenotype implements Genoty
|
|||
public String getSampleName() {
|
||||
return mSampleName;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return a new VCFGenotypeCall with the same internal data as this one
|
||||
*/
|
||||
public VCFGenotypeCall cloneCall() {
|
||||
return new VCFGenotypeCall(this.mRefBase, this.mLocation, this.mGenotype, this.mNegLog10PError, this.mCoverage, this.mSampleName);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Jan 11, 2010
|
||||
* Time: 11:30:59 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class SequenomToVCFIntegrationTest extends WalkerTest {
|
||||
@Test
|
||||
public void testSequenomToVCFWithoutSmartHardy() {
|
||||
String testMD5 = "ed121c7a27270ddefeded21b06fcb96d";
|
||||
String testPedFile = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/Sequenom_Test_File.txt";
|
||||
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -L 1:1000000-2000000 "+
|
||||
"-T SequenomToVCF -b36contig -ns 10 -sPed "+testPedFile+" -vcf %s";
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs,1, Arrays.asList(testMD5));
|
||||
List<File> result = executeTest("TestSequenomToVCFNoSmartHardy",spec).getFirst();
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue