Updated HapMap2VCF to use the VCFGenotypeWriterAdapter interface; fixed bug in VCFParameters that affects VariantsToVCF and HapMap2VCF when reference is lower-cased; added integration test for HapMap2VCF that checks for the lower-case issue by testing against Hg18 region that has lower-cased bases
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2530 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
576594eda2
commit
6c4ac9e663
|
|
@ -6,17 +6,31 @@ import org.broadinstitute.sting.gatk.refdata.HapMapGenotypeROD;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.VariationCall;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Converts HapMap genotype information to VCF format
|
||||
*/
|
||||
public class HapMap2VCF extends RodWalker<Integer, Integer> {
|
||||
|
||||
String[] sample_ids;
|
||||
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
|
||||
protected File VCF_OUT;
|
||||
|
||||
private VCFGenotypeWriterAdapter vcfWriter;
|
||||
String[] sample_ids = null;
|
||||
|
||||
public void initialize() {
|
||||
vcfWriter = new VCFGenotypeWriterAdapter(VCF_OUT);
|
||||
}
|
||||
|
||||
/**
|
||||
* For each HapMap record, generate and print the VCF record string
|
||||
|
|
@ -26,112 +40,64 @@ public class HapMap2VCF extends RodWalker<Integer, Integer> {
|
|||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
Iterator<ReferenceOrderedDatum> rods = tracker.getAllRods().iterator();
|
||||
boolean first_hapmap_rod = true;
|
||||
while ( rods.hasNext() ) {
|
||||
ReferenceOrderedDatum rod = rods.next();
|
||||
for ( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
|
||||
if ( rod instanceof HapMapGenotypeROD ) {
|
||||
HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod;
|
||||
if (first_hapmap_rod) {
|
||||
out.println(VCFHeaderString(hapmap_rod.getSampleIDs()));
|
||||
first_hapmap_rod = false;
|
||||
|
||||
// If this is the first time map is called, we need to fill out the sample_ids from the rod
|
||||
if (sample_ids == null) {
|
||||
// ensure that there are no duplicate sample IDs
|
||||
sample_ids = hapmap_rod.getSampleIDs();
|
||||
Set<String> sample_id_set = new LinkedHashSet<String>(Arrays.asList(sample_ids));
|
||||
if (sample_id_set.size() != sample_ids.length)
|
||||
throw new IllegalStateException("Sample set passed into HapMap2VCF has repeated sample IDs");
|
||||
|
||||
// setup the header fields
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("source", "HapMap2VCF"));
|
||||
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
|
||||
|
||||
// write out the header once
|
||||
vcfWriter.writeHeader(sample_id_set, hInfo);
|
||||
}
|
||||
|
||||
// Get reference and alternate bases
|
||||
|
||||
Character alt_allele, ref_allele;
|
||||
|
||||
Set<Character> observed_alleles = getObservedAlleles (hapmap_rod.getGenotypes());
|
||||
if (observed_alleles.contains('N'))
|
||||
observed_alleles.remove('N');
|
||||
ref_allele = ref.getBase();
|
||||
if (observed_alleles.contains(ref_allele))
|
||||
observed_alleles.remove(ref_allele);
|
||||
if (observed_alleles.isEmpty()) {
|
||||
alt_allele = ref_allele; // ## todo: Confirm that alt allele becomes ref allele
|
||||
}else{
|
||||
if (observed_alleles.size() != 1) {
|
||||
out.println("Error: more than 2 alleles found in hapmap chip position");
|
||||
alt_allele = 'N';
|
||||
System.exit(1);
|
||||
}else{
|
||||
alt_allele = observed_alleles.iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
// Print all position specific info
|
||||
String vcf = hapmap_rod.get("chrom")+"\t"+
|
||||
hapmap_rod.get("pos")+"\t"+
|
||||
hapmap_rod.get("rs#")+"\t"+
|
||||
ref_allele+"\t"+
|
||||
alt_allele+"\t"+
|
||||
"99\t0\t.\tGT:GQ";
|
||||
out.print(vcf);
|
||||
// Get reference base and locus
|
||||
Character ref_allele = ref.getBase();
|
||||
GenomeLoc loc = hapmap_rod.getLocation();
|
||||
VCFVariationCall variation = new VCFVariationCall(ref_allele, loc, Variation.VARIANT_TYPE.SNP);
|
||||
|
||||
// Print each sample's genotype info
|
||||
List<Genotype> genotype_calls = new ArrayList<Genotype>();
|
||||
String[] hapmap_rod_genotypes = hapmap_rod.getGenotypes();
|
||||
for (int i = 0; i < hapmap_rod_genotypes.length; i++) {
|
||||
String sample_id = sample_ids[i];
|
||||
String hapmap_rod_genotype = hapmap_rod_genotypes[i];
|
||||
|
||||
String allele_strings = "";
|
||||
for (String genotype : hapmap_rod.getGenotypes()) {
|
||||
String allele_str = ""; // one allele string
|
||||
Integer GQ = 99;
|
||||
for (Character allele : genotype.toCharArray()) {
|
||||
if (allele == ref_allele) {
|
||||
allele_str += "0";
|
||||
}else{
|
||||
if (allele == alt_allele) {
|
||||
allele_str += "1";
|
||||
}else{
|
||||
if (allele == 'N') {
|
||||
allele_str += "0";
|
||||
GQ = 0;
|
||||
}else{
|
||||
out.println("ERROR: Unexpected tri-allelic site detected");
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (allele_str.length() == 1) {
|
||||
allele_str += "/";
|
||||
}
|
||||
// for each sample, set the genotype if it exists
|
||||
VCFGenotypeCall genotype = new VCFGenotypeCall(ref_allele, loc);
|
||||
if (!hapmap_rod_genotype.contains("N")) {
|
||||
genotype.setGenotype(DiploidGenotype.createDiploidGenotype(hapmap_rod_genotype.charAt(0), hapmap_rod_genotype.charAt(1)));
|
||||
genotype.setSampleName(sample_id);
|
||||
genotype_calls.add(genotype);
|
||||
}
|
||||
allele_strings += "\t" + allele_str+":"+GQ;
|
||||
}
|
||||
|
||||
out.println(allele_strings);
|
||||
// add each genotype to VCF record and write it
|
||||
variation.setGenotypeCalls(genotype_calls);
|
||||
vcfWriter.addMultiSampleCall(genotype_calls, new VCFVariationCall(ref_allele, loc, Variation.VARIANT_TYPE.SNP));
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public static Set<Character> getObservedAlleles (String[] genotypes) {
|
||||
Set<Character> observed_alleles = new HashSet<Character>();
|
||||
for (String genotype : genotypes)
|
||||
for (Character allele : genotype.toCharArray())
|
||||
if (!observed_alleles.contains(allele))
|
||||
observed_alleles.add(allele);
|
||||
|
||||
return observed_alleles;
|
||||
}
|
||||
|
||||
public String VCFHeaderString (String[] sample_ids) {
|
||||
String header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
|
||||
for (String sample_id : sample_ids)
|
||||
header += "\t" + sample_id;
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the number of loci processed to zero.
|
||||
*
|
||||
* @return 0
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
out.println("#fileformat=VCFv3.3");
|
||||
out.println("##reference=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta");
|
||||
out.println("##source=VariantsToVCF");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ class VCFParameters {
|
|||
|
||||
public void addAlternateBase(VCFGenotypeEncoding base) {
|
||||
if ( !alternateBases.contains(base) &&
|
||||
!base.toString().equals(String.valueOf(getReferenceBase())) &&
|
||||
!base.toString().equals(String.valueOf(getReferenceBase()).toUpperCase()) &&
|
||||
!base.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) )
|
||||
alternateBases.add(base);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,37 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VariantsToVCFIntegrationTest
|
||||
* <p/>
|
||||
* test(s) for the VariantsToVCF walker.
|
||||
*/
|
||||
public class HapMap2VCFIntegrationTest extends WalkerTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testHapMap2VCF() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("4d36df142bbd3d446baec6213771800a");
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" +
|
||||
" --rodBind HapMapChip,HapMapGenotype," + validationDataLocation + "hapmap_phase_ii+iii_genotypes_chrX_YRI_r27_nr.hapmap" +
|
||||
" -T HapMap2VCF" +
|
||||
" -L chrX:1-1,000,000" +
|
||||
" --vcfOutput %s",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testHapMap2VCFUsingGeliInput", spec).getFirst();
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue