Converter from HapMap chip genotype data to VCF added; HapMapGenotypeROD adjusted to not convert from Hg18 to b36 formatting of contigs
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2447 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d2770f380c
commit
57516582c2
|
|
@ -16,7 +16,9 @@ public class HapMapGenotypeROD extends TabularROD
|
|||
{
|
||||
//System.out.printf("chrom: %s; pos: %s\n", this.get("chrom"), this.get("pos"));
|
||||
|
||||
return GenomeLocParser.createGenomeLoc(this.get("chrom").replaceAll("chr", ""), Long.parseLong(this.get("pos")));
|
||||
// For converting from Hg18 to b36 format:
|
||||
// return GenomeLocParser.createGenomeLoc(this.get("chrom").replaceAll("chr", ""), Long.parseLong(this.get("pos")));
|
||||
return GenomeLocParser.createGenomeLoc(this.get("chrom"), Long.parseLong(this.get("pos")));
|
||||
}
|
||||
|
||||
public String[] getSampleIDs()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,151 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.HapMapGenotypeROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Converts HapMap genotype information to VCF format
|
||||
*/
|
||||
public class HapMap2VCF extends RodWalker<Integer, Integer> {
|
||||
|
||||
String[] sample_ids;
|
||||
|
||||
/**
|
||||
* For each HapMap record, generate and print the VCF record string
|
||||
* Output the VCF header if this is the first record
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
Iterator<ReferenceOrderedDatum> rods = tracker.getAllRods().iterator();
|
||||
boolean first_hapmap_rod = true;
|
||||
while ( rods.hasNext() ) {
|
||||
ReferenceOrderedDatum rod = rods.next();
|
||||
if ( rod instanceof HapMapGenotypeROD ) {
|
||||
HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod;
|
||||
if (first_hapmap_rod) {
|
||||
out.println(VCFHeaderString(hapmap_rod.getSampleIDs()));
|
||||
first_hapmap_rod = false;
|
||||
}
|
||||
|
||||
// Get reference and alternate bases
|
||||
|
||||
Character alt_allele, ref_allele;
|
||||
|
||||
Set<Character> observed_alleles = getObservedAlleles (hapmap_rod.getGenotypes());
|
||||
if (observed_alleles.contains('N'))
|
||||
observed_alleles.remove('N');
|
||||
ref_allele = ref.getBase();
|
||||
if (observed_alleles.contains(ref_allele))
|
||||
observed_alleles.remove(ref_allele);
|
||||
if (observed_alleles.isEmpty()) {
|
||||
alt_allele = ref_allele; // ## todo: Confirm that alt allele becomes ref allele
|
||||
}else{
|
||||
if (observed_alleles.size() != 1) {
|
||||
out.println("Error: more than 2 alleles found in hapmap chip position");
|
||||
alt_allele = 'N';
|
||||
System.exit(1);
|
||||
}else{
|
||||
alt_allele = observed_alleles.iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
// Print all position specific info
|
||||
String vcf = hapmap_rod.get("chrom")+"\t"+
|
||||
hapmap_rod.get("pos")+"\t"+
|
||||
hapmap_rod.get("rs#")+"\t"+
|
||||
ref_allele+"\t"+
|
||||
alt_allele+"\t"+
|
||||
"99\t0\t.\tGT:GQ";
|
||||
out.print(vcf);
|
||||
|
||||
// Print each sample's genotype info
|
||||
|
||||
String allele_strings = "";
|
||||
for (String genotype : hapmap_rod.getGenotypes()) {
|
||||
String allele_str = ""; // one allele string
|
||||
Integer GQ = 99;
|
||||
for (Character allele : genotype.toCharArray()) {
|
||||
if (allele == ref_allele) {
|
||||
allele_str += "0";
|
||||
}else{
|
||||
if (allele == alt_allele) {
|
||||
allele_str += "1";
|
||||
}else{
|
||||
if (allele == 'N') {
|
||||
allele_str += "0";
|
||||
GQ = 0;
|
||||
}else{
|
||||
out.println("ERROR: Unexpected tri-allelic site detected");
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (allele_str.length() == 1) {
|
||||
allele_str += "/";
|
||||
}
|
||||
}
|
||||
allele_strings += "\t" + allele_str+":"+GQ;
|
||||
}
|
||||
|
||||
out.println(allele_strings);
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public static Set<Character> getObservedAlleles (String[] genotypes) {
|
||||
Set<Character> observed_alleles = new HashSet<Character>();
|
||||
for (String genotype : genotypes)
|
||||
for (Character allele : genotype.toCharArray())
|
||||
if (!observed_alleles.contains(allele))
|
||||
observed_alleles.add(allele);
|
||||
|
||||
return observed_alleles;
|
||||
}
|
||||
|
||||
public String VCFHeaderString (String[] sample_ids) {
|
||||
String header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
|
||||
for (String sample_id : sample_ids)
|
||||
header += "\t" + sample_id;
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the number of loci processed to zero.
|
||||
*
|
||||
* @return 0
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
out.println("#fileformat=VCFv3.3");
|
||||
out.println("##reference=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta");
|
||||
out.println("##source=VariantsToVCF");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the number of rods processed.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return the new number of rods processed.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer value) {}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue