ImputeAllelesWalker fills missing portions of HLA dictionary based on best allele matches
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1729 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
235de38c2e
commit
f7684d9e1b
|
|
@ -0,0 +1,269 @@
|
||||||
|
/*
|
||||||
|
* ImputeAllelesWalker fills in the missing portions of each HLA allele sequence using its closest-matching allele
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.Math;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author shermanjia
|
||||||
|
*/
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
|
public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam";
|
||||||
|
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.closest";
|
||||||
|
|
||||||
|
boolean DatabaseLoaded = false;
|
||||||
|
boolean DEBUG = false;
|
||||||
|
|
||||||
|
ArrayList<String> HLAreads = new ArrayList<String>();
|
||||||
|
ArrayList<String> HLAcigars = new ArrayList<String>();
|
||||||
|
ArrayList<String> HLAnames = new ArrayList<String>();
|
||||||
|
ArrayList<String> HLApositions = new ArrayList<String>();
|
||||||
|
double[] SingleAlleleFrequencies;
|
||||||
|
|
||||||
|
int numHLAlleles = 0;
|
||||||
|
int[] HLAstartpos;
|
||||||
|
int[] HLAstoppos;
|
||||||
|
int minstartpos = 0;
|
||||||
|
int maxstoppos = 0;
|
||||||
|
|
||||||
|
int HLA_A_start = 30018310;
|
||||||
|
int HLA_A_end = 30021211;
|
||||||
|
int HLA_B_start = 31430239;
|
||||||
|
int HLA_B_end = 31432914;
|
||||||
|
int HLA_C_start = 31344925;
|
||||||
|
int HLA_C_end = 31347827;
|
||||||
|
|
||||||
|
|
||||||
|
ArrayList<String> PolymorphicSites = new ArrayList<String>();
|
||||||
|
|
||||||
|
Hashtable ClosestAllele = new Hashtable();
|
||||||
|
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||||
|
ReadCigarFormatter formatter = new ReadCigarFormatter();
|
||||||
|
|
||||||
|
public Integer reduceInit() {
|
||||||
|
if (!DatabaseLoaded){
|
||||||
|
try{
|
||||||
|
out.printf("Reading HLA database ...\n");
|
||||||
|
FileInputStream fstream = new FileInputStream(HLAdatabaseFile);
|
||||||
|
DataInputStream in = new DataInputStream(fstream);
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||||
|
String strLine; String [] s = null;
|
||||||
|
//Read File Line By Line
|
||||||
|
int i = 0;
|
||||||
|
while ((strLine = br.readLine()) != null) {
|
||||||
|
s = strLine.split("\\t");
|
||||||
|
|
||||||
|
if (s.length>=10){
|
||||||
|
//Parse the reads with cigar parser
|
||||||
|
HLAreads.add(formatter.FormatRead(s[5],s[9]));
|
||||||
|
HLAcigars.add(s[5]);
|
||||||
|
HLAnames.add(s[0]);
|
||||||
|
|
||||||
|
HLApositions.add(s[3]);
|
||||||
|
if (s[0].indexOf("HLA_A") > -1){
|
||||||
|
if (iAstart < 0){iAstart=i;}
|
||||||
|
iAstop = i; i++;
|
||||||
|
}else if (s[0].indexOf("HLA_B") > -1){
|
||||||
|
if (iBstart < 0){iBstart=i;}
|
||||||
|
iBstop = i; i++;
|
||||||
|
}else if (s[0].indexOf("HLA_C") > -1){
|
||||||
|
if (iCstart < 0){iCstart=i;}
|
||||||
|
iCstop = i; i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
in.close();
|
||||||
|
int n = HLApositions.size(); numHLAlleles = n;
|
||||||
|
HLAstartpos = new int[n]; HLAstoppos = new int[n];
|
||||||
|
SingleAlleleFrequencies = new double[n];
|
||||||
|
|
||||||
|
|
||||||
|
for (i = 0; i < n; i++){
|
||||||
|
//Find start and stop positions for each allele
|
||||||
|
HLAstartpos[i]=Integer.parseInt(HLApositions.get(i));
|
||||||
|
HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1;
|
||||||
|
if (minstartpos == 0){minstartpos = HLAstartpos[i];}
|
||||||
|
minstartpos = Math.min(minstartpos, HLAstartpos[i]);
|
||||||
|
maxstoppos = Math.max(maxstoppos, HLAstoppos[i]);
|
||||||
|
SingleAlleleFrequencies[i]=0.0;
|
||||||
|
//Initialize matrix of probabilities / likelihoods
|
||||||
|
|
||||||
|
}
|
||||||
|
out.printf("DONE! Read %s alleles\n",HLAreads.size());
|
||||||
|
}catch (Exception e){//Catch exception if any
|
||||||
|
System.err.println("Error: " + e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
try{
|
||||||
|
out.printf("Reading closest allele file ...");
|
||||||
|
FileInputStream fstream = new FileInputStream(ClosestAllelesFile);
|
||||||
|
DataInputStream in = new DataInputStream(fstream);
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||||
|
String strLine; String [] s = null;
|
||||||
|
//Read File Line By Line
|
||||||
|
int count = 0;
|
||||||
|
while ((strLine = br.readLine()) != null) {
|
||||||
|
s = strLine.split("\\t");
|
||||||
|
ClosestAllele.put(s[0], s[2]);
|
||||||
|
// out.printf("loading: %s\t%s\n",s[0],s[2]);
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
in.close();
|
||||||
|
out.printf("Done! Read %s alleles\n",count);
|
||||||
|
}catch (Exception e){//Catch exception if any
|
||||||
|
System.err.println("Error: " + e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
char c;
|
||||||
|
DatabaseLoaded = true;
|
||||||
|
|
||||||
|
out.printf("Imputing alleles ...\n");
|
||||||
|
|
||||||
|
if (DEBUG){
|
||||||
|
//out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Integer map(char[] ref, SAMRecord read) {
|
||||||
|
int readstart = read.getAlignmentStart();
|
||||||
|
int readstop = read.getAlignmentEnd();
|
||||||
|
int startimputation = 0, stopimputation = 0;
|
||||||
|
|
||||||
|
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
|
||||||
|
char c;
|
||||||
|
String readstring = "", name = "", cigar = "", qualitystring = "";
|
||||||
|
int numM = 0, numI = 0, numD = 0;
|
||||||
|
|
||||||
|
name = read.getReadName();
|
||||||
|
String matchedAllele = (String) ClosestAllele.get(name);
|
||||||
|
|
||||||
|
//out.printf("%s\t%s\n",name,matchedAllele);
|
||||||
|
int index = HLAnames.indexOf(matchedAllele);
|
||||||
|
String matchedRead = HLAreads.get(index);
|
||||||
|
|
||||||
|
if (name.indexOf("HLA_A") > -1){
|
||||||
|
startimputation = HLA_A_start;
|
||||||
|
stopimputation = HLA_A_end;
|
||||||
|
} else if (name.indexOf("HLA_B") > -1){
|
||||||
|
startimputation = HLA_B_start;
|
||||||
|
stopimputation = HLA_B_end;
|
||||||
|
} else if (name.indexOf("HLA_C") > -1){
|
||||||
|
startimputation = HLA_C_start;
|
||||||
|
stopimputation = HLA_C_end;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = startimputation; i <= stopimputation; i++){
|
||||||
|
//if position is within read
|
||||||
|
if (i >= readstart && i <= readstop){
|
||||||
|
c = s1.charAt(i-readstart);
|
||||||
|
//if position is not missing
|
||||||
|
if (c != 'D'){
|
||||||
|
readstring = readstring + c;
|
||||||
|
qualitystring = qualitystring + 'I';
|
||||||
|
numM++;
|
||||||
|
if (numD > 0){
|
||||||
|
cigar = cigar + String.valueOf(numD) + "D";
|
||||||
|
numD = 0;
|
||||||
|
} else if (numI > 0){
|
||||||
|
cigar = cigar + String.valueOf(numI) + "I";
|
||||||
|
numI = 0;
|
||||||
|
}
|
||||||
|
//if position is missing, get base from matched allele
|
||||||
|
}else{
|
||||||
|
c = matchedRead.charAt(i-HLAstartpos[index]);
|
||||||
|
//if matched allele is also missing / deleted at position
|
||||||
|
if (c == 'D'){
|
||||||
|
numD++;
|
||||||
|
if (numM > 0){
|
||||||
|
cigar = cigar + String.valueOf(numM) + "M";
|
||||||
|
numM = 0;
|
||||||
|
}
|
||||||
|
//if matched allele is not missing / deleted at position
|
||||||
|
}else{
|
||||||
|
readstring = readstring + c;
|
||||||
|
qualitystring = qualitystring + 'I';
|
||||||
|
numM++;
|
||||||
|
if (numD > 0){
|
||||||
|
cigar = cigar + String.valueOf(numD) + "D";
|
||||||
|
numD = 0;
|
||||||
|
} else if (numI > 0){
|
||||||
|
cigar = cigar + String.valueOf(numI) + "I";
|
||||||
|
numI = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//if position is outside of range of read, look at matched allele
|
||||||
|
}else{
|
||||||
|
//if within range of matched allele
|
||||||
|
if (i >= HLAstartpos[index] && i <= HLAstoppos[index]){
|
||||||
|
c = matchedRead.charAt(i-HLAstartpos[index]);
|
||||||
|
//if matched allele is also missing / deleted at position
|
||||||
|
if (c == 'D'){
|
||||||
|
numD++;
|
||||||
|
if (numM > 0){
|
||||||
|
cigar = cigar + String.valueOf(numM) + "M";
|
||||||
|
numM = 0;
|
||||||
|
}
|
||||||
|
//if matched allele is not missing / deleted at position
|
||||||
|
}else{
|
||||||
|
readstring = readstring + c;
|
||||||
|
qualitystring = qualitystring + 'I';
|
||||||
|
numM++;
|
||||||
|
if (numD > 0){
|
||||||
|
cigar = cigar + String.valueOf(numD) + "D";
|
||||||
|
numD = 0;
|
||||||
|
} else if (numI > 0){
|
||||||
|
cigar = cigar + String.valueOf(numI) + "I";
|
||||||
|
numI = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
numD++;
|
||||||
|
if (numM > 0){
|
||||||
|
cigar = cigar + String.valueOf(numM) + "M";
|
||||||
|
numM = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numM > 0){
|
||||||
|
cigar = cigar + String.valueOf(numM) + "M";
|
||||||
|
}else if(numD > 0){
|
||||||
|
cigar = cigar + String.valueOf(numD) + "D";
|
||||||
|
}else if(numI > 0){
|
||||||
|
cigar = cigar + String.valueOf(numI) + "I";
|
||||||
|
}
|
||||||
|
|
||||||
|
out.printf("%s\t0\t6\t%s\t99\t%s\t*\t0\t0\t%s\t%s\n",name,startimputation,cigar,readstring,qualitystring);
|
||||||
|
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return value + sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Loading…
Reference in New Issue