Converter from HapMap chip genotype data to VCF added; HapMapGenotypeROD adjusted to not convert from Hg18 to b36 formatting of contigs

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2447 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-12-28 01:36:08 +00:00
parent d2770f380c
commit 57516582c2
2 changed files with 154 additions and 1 deletions

View File

@ -16,7 +16,9 @@ public class HapMapGenotypeROD extends TabularROD
{
//System.out.printf("chrom: %s; pos: %s\n", this.get("chrom"), this.get("pos"));
return GenomeLocParser.createGenomeLoc(this.get("chrom").replaceAll("chr", ""), Long.parseLong(this.get("pos")));
// For converting from Hg18 to b36 format:
// return GenomeLocParser.createGenomeLoc(this.get("chrom").replaceAll("chr", ""), Long.parseLong(this.get("pos")));
return GenomeLocParser.createGenomeLoc(this.get("chrom"), Long.parseLong(this.get("pos")));
}
public String[] getSampleIDs()

View File

@ -0,0 +1,151 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.HapMapGenotypeROD;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
/**
* Converts HapMap genotype information to VCF format
*/
public class HapMap2VCF extends RodWalker<Integer, Integer> {
String[] sample_ids;
/**
* For each HapMap record, generate and print the VCF record string
* Output the VCF header if this is the first record
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return 0;
Iterator<ReferenceOrderedDatum> rods = tracker.getAllRods().iterator();
boolean first_hapmap_rod = true;
while ( rods.hasNext() ) {
ReferenceOrderedDatum rod = rods.next();
if ( rod instanceof HapMapGenotypeROD ) {
HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod;
if (first_hapmap_rod) {
out.println(VCFHeaderString(hapmap_rod.getSampleIDs()));
first_hapmap_rod = false;
}
// Get reference and alternate bases
Character alt_allele, ref_allele;
Set<Character> observed_alleles = getObservedAlleles (hapmap_rod.getGenotypes());
if (observed_alleles.contains('N'))
observed_alleles.remove('N');
ref_allele = ref.getBase();
if (observed_alleles.contains(ref_allele))
observed_alleles.remove(ref_allele);
if (observed_alleles.isEmpty()) {
alt_allele = ref_allele; // ## todo: Confirm that alt allele becomes ref allele
}else{
if (observed_alleles.size() != 1) {
out.println("Error: more than 2 alleles found in hapmap chip position");
alt_allele = 'N';
System.exit(1);
}else{
alt_allele = observed_alleles.iterator().next();
}
}
// Print all position specific info
String vcf = hapmap_rod.get("chrom")+"\t"+
hapmap_rod.get("pos")+"\t"+
hapmap_rod.get("rs#")+"\t"+
ref_allele+"\t"+
alt_allele+"\t"+
"99\t0\t.\tGT:GQ";
out.print(vcf);
// Print each sample's genotype info
String allele_strings = "";
for (String genotype : hapmap_rod.getGenotypes()) {
String allele_str = ""; // one allele string
Integer GQ = 99;
for (Character allele : genotype.toCharArray()) {
if (allele == ref_allele) {
allele_str += "0";
}else{
if (allele == alt_allele) {
allele_str += "1";
}else{
if (allele == 'N') {
allele_str += "0";
GQ = 0;
}else{
out.println("ERROR: Unexpected tri-allelic site detected");
System.exit(1);
}
}
}
if (allele_str.length() == 1) {
allele_str += "/";
}
}
allele_strings += "\t" + allele_str+":"+GQ;
}
out.println(allele_strings);
}
}
return 1;
}
public static Set<Character> getObservedAlleles (String[] genotypes) {
Set<Character> observed_alleles = new HashSet<Character>();
for (String genotype : genotypes)
for (Character allele : genotype.toCharArray())
if (!observed_alleles.contains(allele))
observed_alleles.add(allele);
return observed_alleles;
}
public String VCFHeaderString (String[] sample_ids) {
String header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
for (String sample_id : sample_ids)
header += "\t" + sample_id;
return header;
}
/**
* Initialize the number of loci processed to zero.
*
* @return 0
*/
public Integer reduceInit() {
out.println("#fileformat=VCFv3.3");
out.println("##reference=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta");
out.println("##source=VariantsToVCF");
return 0;
}
/**
* Increment the number of rods processed.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of rods processed.
*/
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
public void onTraversalDone(Integer value) {}
}