git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1537 348d0f76-0448-11de-a6fe-93d51630548a

This commit is contained in:
sjia 2009-09-04 19:12:46 +00:00
parent 0cc634ed5d
commit 471ca8201e
1 changed files with 146 additions and 93 deletions

View File

@ -11,8 +11,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotype;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods;
import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
@ -48,17 +46,21 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
int[] HLAstartpos; int[] HLAstartpos;
int[] HLAstoppos; int[] HLAstoppos;
int numHLAlleles = 0; int numHLAlleles = 0;
int numInterval = 1;
double[][] Prob; String[][] Alleles; double[][] Prob; String[][] Alleles;
int j1, k1, j2, k2; int j1, k1, j2, k2;
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
Hashtable Scores = new Hashtable(); Hashtable Scores = new Hashtable();
boolean DatabaseLoaded = false; boolean DatabaseLoaded = false;
boolean PrintedOutput = false; boolean PrintedOutput = false;
boolean DEBUG = false; boolean DEBUG = true;
public Pair<Long, Long> reduceInit() { public Pair<Long, Long> reduceInit() {
//Load sequences corresponding to HLA alleles from sam file //Load sequences corresponding to HLA alleles from sam file
if (!DatabaseLoaded){ if (!DatabaseLoaded){
try{ try{
out.printf("Reading HLA database ..."); out.printf("Reading HLA database ...");
@ -67,6 +69,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
BufferedReader br = new BufferedReader(new InputStreamReader(in)); BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null; String strLine; String [] s = null;
//Read File Line By Line //Read File Line By Line
int i = 0;
while ((strLine = br.readLine()) != null) { while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t"); s = strLine.split("\\t");
if (s.length>=10){ if (s.length>=10){
@ -75,6 +78,16 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
HLAcigars.add(s[5]); HLAcigars.add(s[5]);
HLAnames.add(s[0]); HLAnames.add(s[0]);
HLApositions.add(s[3]); HLApositions.add(s[3]);
if (s[0].indexOf("HLA_A") > -1){
if (iAstart < 0){iAstart=i;}
iAstop = i; i++;
}else if (s[0].indexOf("HLA_B") > -1){
if (iBstart < 0){iBstart=i;}
iBstop = i; i++;
}else if (s[0].indexOf("HLA_C") > -1){
if (iCstart < 0){iCstart=i;}
iCstop = i; i++;
}
} }
} }
in.close(); in.close();
@ -82,7 +95,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
HLAstartpos = new int[n]; HLAstoppos = new int[n]; HLAstartpos = new int[n]; HLAstoppos = new int[n];
Prob = new double[n][n]; Prob = new double[n][n];
for (int i = 0; i < n; i++){ for (i = 0; i < n; i++){
//Find start and stop positions for each allele //Find start and stop positions for each allele
HLAstartpos[i]=Integer.parseInt(HLApositions.get(i)); HLAstartpos[i]=Integer.parseInt(HLApositions.get(i));
HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1; HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1;
@ -109,11 +122,21 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
out.printf("Comparing reads to database ...\n"); out.printf("Comparing reads to database ...\n");
//For debugging: prints which HLA alleles were indexed before //For debugging: prints which HLA alleles were indexed before
if (DEBUG){out.printf("%s,%s\t%s,%s\n",HLAnames.get(j1),HLAnames.get(k1),HLAnames.get(j2),HLAnames.get(k2));} if (j1 > k1){int tmp = k1; k1 = j1; j1 = tmp;}
if (j2 > k2){int tmp = k2; k2 = j2; j2 = tmp;}
if (DEBUG){
out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
out.printf("%s,%s\t%s,%s\n",HLAnames.get(j1),HLAnames.get(k1),HLAnames.get(j2),HLAnames.get(k2));
} }
}
out.printf("Computing for interval %s...\n",numInterval);
numInterval++;
return new Pair<Long,Long>(0l,0l); return new Pair<Long,Long>(0l,0l);
} }
private String CigarFormatted(String cigar, String read){ private String CigarFormatted(String cigar, String read){
// returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur // returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur
String formattedRead = ""; char c; String count; String formattedRead = ""; char c; String count;
@ -186,6 +209,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
int numCs = 0; int numCs = 0;
int numGs = 0; int numGs = 0;
int numTs = 0; int numTs = 0;
int depth = 0;
String c1 = ""; String c2 = ""; String c1 = ""; String c2 = "";
long pos_k = 0; long pos_j = 0; long pos_k = 0; long pos_j = 0;
@ -198,7 +222,10 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
} }
//Calculate posterior probabilities! //Calculate posterior probabilities!
GenotypeLikelihoods G = new EmpiricalSubstitutionGenotypeLikelihoods();
GenotypeLikelihoods G = new ThreeStateErrorGenotypeLikelihoods();
//I've tried simply adding the entire pileup to g, but quality scores are not checked, and 'N' bases throw an error //I've tried simply adding the entire pileup to g, but quality scores are not checked, and 'N' bases throw an error
//(GenotypeLikelihoods potentially has bug on line 57) //(GenotypeLikelihoods potentially has bug on line 57)
@ -206,16 +233,17 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
//Check for bad bases and ensure mapping quality myself. This works. //Check for bad bases and ensure mapping quality myself. This works.
for (int i = 0; i < context.getReads().size(); i++) { for (int i = 0; i < context.getReads().size(); i++) {
SAMRecord read = context.getReads().get(i); SAMRecord read = context.getReads().get(i);
int offset = context.getOffsets().get(i); int offset = context.getOffsets().get(i);
char base = read.getReadString().charAt(offset); char base = read.getReadString().charAt(offset);
byte qual = read.getBaseQualities()[offset]; byte qual = read.getBaseQualities()[offset];
int mapquality = read.getMappingQuality(); int mapquality = read.getMappingQuality();
if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) { if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
if (base == 'A'){numAs++;} if (base == 'A'){numAs++; depth++;}
if (base == 'C'){numCs++;} if (base == 'C'){numCs++; depth++;}
if (base == 'T'){numTs++;} if (base == 'T'){numTs++; depth++;}
if (base == 'g'){numGs++;} if (base == 'G'){numGs++; depth++;}
//consider base in likelihood calculations if it looks good and has high mapping score //consider base in likelihood calculations if it looks good and has high mapping score
G.add(base, qual, read, offset); G.add(base, qual, read, offset);
} }
@ -224,11 +252,19 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
//Debugging purposes //Debugging purposes
if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tg[%s]\t",numAs,numCs,numTs,numGs);} if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tg[%s]\t",numAs,numCs,numTs,numGs);}
if (depth > 0){
//Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype //Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype
Scores = new Hashtable(); Scores = new Hashtable();
for ( DiploidGenotype g : DiploidGenotype.values() ) { for ( DiploidGenotype g : DiploidGenotype.values() ) {
Scores.put(g.toString(), G.getLikelihood(g)); Scores.put(g.toString(), G.getLikelihood(g));
//also hash other combination not stored by DiploidGenotype
if (g.toString().equals("AC")) Scores.put("CA", G.getLikelihood(g));
if (g.toString().equals("AG")) Scores.put("GA", G.getLikelihood(g));
if (g.toString().equals("AT")) Scores.put("TA", G.getLikelihood(g));
if (g.toString().equals("CG")) Scores.put("GC", G.getLikelihood(g));
if (g.toString().equals("CT")) Scores.put("TC", G.getLikelihood(g));
if (g.toString().equals("GT")) Scores.put("TG", G.getLikelihood(g));
} }
//Get likelihood score for homozygous ref: used to normalize likelihoood scores at 0. //Get likelihood score for homozygous ref: used to normalize likelihoood scores at 0.
@ -252,8 +288,25 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
if (j == j2) s1 = c1; if (j == j2) s1 = c1;
if (j == k2) s2 = c1; if (j == k2) s2 = c1;
//Updade likelihoods //Only check A-A, B-C, C-C combinations
for (int k = 0; k < numHLAlleles; k++){ int kStart = 0, kStop = 0;
if (j >= iAstart && j <= iAstop){
kStart = iAstart; kStop = iAstop;
} else if (j >= iBstart && j <= iBstop){
kStart = iBstart; kStop = iBstop;
} else if (j >= iCstart && j <= iCstop){
kStart = iCstart; kStop = iCstop;
}
//Fill half-matrix only to speed up process
if (j > kStart){kStart = j;}
if (DEBUG){
//out.printf("j[%s],k[%s,%s]\t",j,kStart,kStop);
}
//Update likelihoods
for (int k = kStart; k <= kStop; k++){
//check if allele 2 overlaps current position //check if allele 2 overlaps current position
if (loc >= HLAstartpos[k] && loc <= HLAstoppos[k]){ if (loc >= HLAstartpos[k] && loc <= HLAstoppos[k]){
@ -261,22 +314,23 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
c2 = Character.toString(Character.toUpperCase(HLAreads.get(k).charAt((int) pos_k))); c2 = Character.toString(Character.toUpperCase(HLAreads.get(k).charAt((int) pos_k)));
//updates likelihoods for both permutations of the alleles, normalized to the likelihood for homozygous reference //updates likelihoods for both permutations of the alleles, normalized to the likelihood for homozygous reference
if (!homref.equals(c1+c2)){
if (Scores.containsKey(c1 + c2)){ if (Scores.containsKey(c1 + c2)){
//out.printf("j[%s],k[%s],g[%s],s[%.0f]\t",j,k,c1+c2,Scores.get(c1 + c2));
likelihood = Double.parseDouble((String) Scores.get(c1 + c2).toString()); likelihood = Double.parseDouble((String) Scores.get(c1 + c2).toString());
Prob[j][k]= Prob[j][k]+ likelihood - homreflikelihood; Prob[j][k]= Prob[j][k]+ likelihood - homreflikelihood;
} else{
}else if(Scores.containsKey(c2 + c1)){ if (DEBUG){
//out.printf("\nCharacters [%s] not found,j[%s],k[%s],%s,%s\n",c1+c2,j,k,HLAnames.get(j),HLAnames.get(k));
likelihood = Double.parseDouble((String) Scores.get(c2 + c1).toString()); }
Prob[j][k]= Prob[j][k]+ likelihood - homreflikelihood; }
}
}
}
} }
} }
}
}
}
if ( DEBUG ){ if ( DEBUG ){
//Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes //Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes
@ -287,6 +341,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
out.printf("\n"); out.printf("\n");
} }
} }
}
return context.getReads().size(); return context.getReads().size();
} }
@ -318,10 +373,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
//Find max scores for each HLA gene. //Find max scores for each HLA gene.
for (int i = 0; i < numHLAlleles; i++){ for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){ for (int j = i; j < numHLAlleles; j++){
if (i >= j){
//Print likelihoods for all alleles //Print likelihoods for all alleles
if (!DEBUG){
out.printf("%s\t%s\t%5.0f\n",HLAnames.get(i),HLAnames.get(j),Prob[i][j]); out.printf("%s\t%s\t%5.0f\n",HLAnames.get(i),HLAnames.get(j),Prob[i][j]);
}
if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){
if (Prob[i][j] > maxA){ if (Prob[i][j] > maxA){
@ -341,12 +397,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
} }
} }
} }
}
//Print alleles with the highest likelihoods //Print alleles with the highest likelihoods
for (int i = 0; i < numHLAlleles; i++){ for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){ for (int j = i; j < numHLAlleles; j++){
if (i >= j){
if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){
if (Prob[i][j] == maxA && maxA > 0){ if (Prob[i][j] == maxA && maxA > 0){
out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxA,maxA-maxA2); out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxA,maxA-maxA2);
@ -360,21 +415,19 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxC,maxC-maxC2); out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxC,maxC-maxC2);
} }
} }
}
} }
} }
//2nd Highest likelihoods //2nd Highest likelihoods
for (int i = 0; i < numHLAlleles; i++){ for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){ for (int j = i; j < numHLAlleles; j++){
if (i >= j){
if (Prob[i][j] == maxA2){ if (Prob[i][j] == maxA2){
//out.printf("2nd Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",maxA2,HLAnames.get(i),HLAnames.get(j),i,j); //out.printf("2nd Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",maxA2,HLAnames.get(i),HLAnames.get(j),i,j);
} }
} }
} }
}
PrintedOutput = true; PrintedOutput = true;
//out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond()); //out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond());