Now using larger database of HLA alleles

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1405 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2009-08-11 03:11:14 +00:00
parent 0e7c158949
commit 1851613de4
1 changed files with 63 additions and 32 deletions

View File

@ -27,7 +27,9 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
public boolean suppressPrinting = false; public boolean suppressPrinting = false;
//String HLAdatabaseFile = "/Users/shermanjia/Work/HLA.sam"; //String HLAdatabaseFile = "/Users/shermanjia/Work/HLA.sam";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam"; //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
ArrayList<String> HLAreads = new ArrayList<String>(); ArrayList<String> HLAreads = new ArrayList<String>();
@ -44,6 +46,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
boolean DatabaseLoaded = false; boolean DatabaseLoaded = false;
boolean PrintedOutput = false; boolean PrintedOutput = false;
boolean DEBUG = false;
public Pair<Long, Long> reduceInit() { public Pair<Long, Long> reduceInit() {
//Load sequences corresponding to HLA alleles from sam file //Load sequences corresponding to HLA alleles from sam file
@ -79,13 +82,13 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
Prob[i][j]=0; Prob[i][j]=0;
} }
//For debugging: get index for specific alleles //For debugging: get index for specific alleles
if (HLAnames.get(i).equals("HLA_A*310102")) if (HLAnames.get(i).equals("HLA_A*110101"))
j1 = i; j1 = i;
if (HLAnames.get(i).equals("HLA_A*320101")) if (HLAnames.get(i).equals("HLA_A*01010101"))
k1 = i; k1 = i;
if (HLAnames.get(i).equals("HLA_A*330301")) if (HLAnames.get(i).equals("HLA_A*110201"))
j2 = i; j2 = i;
if (HLAnames.get(i).equals("HLA_A*0265")) if (HLAnames.get(i).equals("HLA_A*0109"))
k2 = i; k2 = i;
} }
out.printf("DONE!\n"); out.printf("DONE!\n");
@ -97,7 +100,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
out.printf("Comparing reads to database ...\n"); out.printf("Comparing reads to database ...\n");
//For debugging: prints which HLA alleles were indexed before //For debugging: prints which HLA alleles were indexed before
//out.printf("%s,%s\t%s,%s\n",HLAnames.get(j1),HLAnames.get(k1),HLAnames.get(j2),HLAnames.get(k2)); if (DEBUG){out.printf("%s,%s\t%s,%s\n",HLAnames.get(j1),HLAnames.get(k1),HLAnames.get(j2),HLAnames.get(k2));}
} }
return new Pair<Long,Long>(0l,0l); return new Pair<Long,Long>(0l,0l);
} }
@ -178,11 +181,12 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
long pos_k = 0; long pos_j = 0; long pos_k = 0; long pos_j = 0;
//Debugging purposes: print location, reference base, pileup, and count (before quality filtering) //Debugging purposes: print location, reference base, pileup, and count (before quality filtering)
//out.printf("%s\t", context.getLocation()); if (DEBUG){
//out.printf("ref=%s\t", ref.getBase()); out.printf("%s\t", context.getLocation());
//out.printf("%s\t",pileup.getBases().toString()); out.printf("ref=%s\t", ref.getBase());
//out.printf("%s\t",pileup.getBasePileupAsCountsString()); //out.printf("%s\t",pileup.getBases().toString());
//out.printf("%s\t",pileup.getBasePileupAsCountsString());
}
//Calculate posterior probabilities! //Calculate posterior probabilities!
NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods(); NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods();
@ -198,7 +202,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
char base = read.getReadString().charAt(offset); char base = read.getReadString().charAt(offset);
byte qual = read.getBaseQualities()[offset]; byte qual = read.getBaseQualities()[offset];
int mapquality = read.getMappingQuality(); int mapquality = read.getMappingQuality();
if (mapquality >= 20 && BaseUtils.simpleBaseToBaseIndex(base) != -1) { if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
if (base == 'A'){numAs++;} if (base == 'A'){numAs++;}
if (base == 'C'){numCs++;} if (base == 'C'){numCs++;}
if (base == 'T'){numTs++;} if (base == 'T'){numTs++;}
@ -209,7 +213,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
} }
//Debugging purposes //Debugging purposes
//out.printf("A[%s]\tC[%s]\tT[%s]\tG[%s]\t",numAs,numCs,numTs,numGs); if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tG[%s]\t",numAs,numCs,numTs,numGs);}
//Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype //Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype
@ -265,13 +269,13 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
} }
} }
if ( !suppressPrinting ){ if ( DEBUG ){
//Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes //Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes
//out.printf("%s%s:%5.0f\t%s%s:%5.0f\t",r1,r2,Prob[j1][k1],s1,s2,Prob[j2][k2]); out.printf("%s%s:%5.0f\t%s%s:%5.0f\t",r1,r2,Prob[j1][k1],s1,s2,Prob[j2][k2]);
for ( DiploidGenotype g : DiploidGenotype.values() ) { for ( DiploidGenotype g : DiploidGenotype.values() ) {
//out.printf("%s %5.0f\t",g.toString(),likelihood - homreflikelihood); out.printf("%s %5.0f\t",g.toString(),likelihood - homreflikelihood);
} }
//out.printf("\n"); out.printf("\n");
} }
} }
@ -292,45 +296,72 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
//Print HLA allele combinations with highest likelihood sums //Print HLA allele combinations with highest likelihood sums
if (!PrintedOutput){ if (!PrintedOutput){
double max = 0; int i_max =0; int j_max = 0; double maxA = 0; int i_maxA =0; int j_maxA = 0;
double m2 = 0; int i_max_2 =0; int j_max_2 = 0; double maxA2 = 0; int i_maxA_2 =0; int j_maxA_2 = 0;
double m3 = 0; int i_max_3 =0; int j_max_3 = 0;
double maxB = 0; int i_maxB =0; int j_maxB = 0;
double maxB2 = 0; int i_maxB_2 =0; int j_maxB_2 = 0;
double maxC = 0; int i_maxC =0; int j_maxC = 0;
double maxC2 = 0; int i_maxC_2 =0; int j_maxC_2 = 0;
out.printf("\n"); out.printf("\n");
//Find max scores. //Find max scores for each HLA gene.
for (int i = 0; i < numHLAlleles; i++){ for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){ for (int j = 0; j < numHLAlleles; j++){
if (i >= j){ if (i >= j){
//Print likelihoods for all alleles //Print likelihoods for all alleles
out.printf("%s\t%s\t%5.0f",HLAnames.get(i),HLAnames.get(j),Prob[i][j]); out.printf("%s\t%s\t%5.0f\n",HLAnames.get(i),HLAnames.get(j),Prob[i][j]);
if (Prob[i][j] > max){
m3 = m2; i_max_3 = i_max_2; j_max_3 = j_max_2; if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){
m2 = max; i_max_2 = i_max; j_max_2 = j_max; if (Prob[i][j] > maxA){
max = Prob[i][j]; i_max = i; j_max = j; maxA2 = maxA; i_maxA_2 = i_maxA; j_maxA_2 = j_maxA;
maxA = Prob[i][j]; i_maxA = i; j_maxA = j;
}
} else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){
if (Prob[i][j] > maxB){
maxB2 = maxB; i_maxB_2 = i_maxB; j_maxB_2 = j_maxB;
maxB = Prob[i][j]; i_maxB = i; j_maxB = j;
}
} else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){
if (Prob[i][j] > maxC){
maxC2 = maxC; i_maxC_2 = i_maxC; j_maxC_2 = j_maxC;
maxC = Prob[i][j]; i_maxC = i; j_maxC = j;
}
} }
} }
} }
} }
//Highest likelihoods //Print alleles with the highest likelihoods
for (int i = 0; i < numHLAlleles; i++){ for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){ for (int j = 0; j < numHLAlleles; j++){
if (i >= j){ if (i >= j){
if (Prob[i][j] == max){ if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){
out.printf("Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",max,HLAnames.get(i),HLAnames.get(j),i,j); if (Prob[i][j] == maxA && maxA > 0){
out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxA,maxA-maxA2);
}
} else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){
if (Prob[i][j] == maxB && maxB > 0){
out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxB,maxB-maxB2);
}
} else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){
if (Prob[i][j] == maxC && maxC > 0){
out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxC,maxC-maxC2);
}
} }
} }
} }
} }
//2nd Highest likelihoods //2nd Highest likelihoods
for (int i = 0; i < numHLAlleles; i++){ for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){ for (int j = 0; j < numHLAlleles; j++){
if (i >= j){ if (i >= j){
if (Prob[i][j] == m2){ if (Prob[i][j] == maxA2){
out.printf("2nd Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",m2,HLAnames.get(i),HLAnames.get(j),i,j); //out.printf("2nd Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",maxA2,HLAnames.get(i),HLAnames.get(j),i,j);
} }
} }
} }