From 9be1832d7b2fa4cd690c99eecebbca13a150eda6 Mon Sep 17 00:00:00 2001 From: sjia Date: Mon, 14 Sep 2009 16:10:37 +0000 Subject: [PATCH] Phasing version 1 git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1613 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/HLAcaller/CallHLAWalker.java | 294 +++++++++++++++--- 1 file changed, 246 insertions(+), 48 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java index bf0a2d1a5..844fff653 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java @@ -52,12 +52,17 @@ public class CallHLAWalker extends LocusWalker>{ int[] HLAstoppos; int numHLAlleles = 0; int numInterval = 1; - double[][] Prob; String[][] Alleles; + double[][] LOD; String[][] Alleles; + double[][] ActualLikelihoods; int j1, k1, j2, k2; int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; Hashtable Scores = new Hashtable(); + Hashtable SNPs = new Hashtable(); + ArrayList SNPlocations = new ArrayList(); + Integer SNPcount = 0; + int[][] SNPcorrelation; boolean DatabaseLoaded = false; boolean PrintedOutput = false; @@ -98,7 +103,8 @@ public class CallHLAWalker extends LocusWalker>{ in.close(); int n = HLApositions.size(); numHLAlleles = n; HLAstartpos = new int[n]; HLAstoppos = new int[n]; - Prob = new double[n][n]; + LOD = new double[n][n]; + ActualLikelihoods = new double[n][n]; for (i = 0; i < n; i++){ //Find start and stop positions for each allele @@ -106,16 +112,17 @@ public class CallHLAWalker extends LocusWalker>{ HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1; //Initialize matrix of probabilities / likelihoods for (int j = 0; j >{ //increment placeholders without adding inserted bases to sequence (effectively removes insertion). cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; + readPlaceholder = readPlaceholder + subreadLength; + } else if (c == 'H'){ + //(Headers removed here)*** + + //If reaches H for insertion, get number before 'H' and skip that many characters in sequence + count = cigar.substring(cigarPlaceholder, i); + subreadLength = Integer.parseInt(count); + + //increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion). + cigarPlaceholder = i+1; } else if (c == 'D'){ //If reaches D for deletion, insert 'D' into sequence as placeholder count = cigar.substring(cigarPlaceholder, i); @@ -232,11 +248,6 @@ public class CallHLAWalker extends LocusWalker>{ GenotypeLikelihoods G = new ThreeStateErrorGenotypeLikelihoods(); - - //I've tried simply adding the entire pileup to g, but quality scores are not checked, and 'N' bases throw an error - //(GenotypeLikelihoods potentially has bug on line 57) - //g.add(pileup); - //Check for bad bases and ensure mapping quality myself. This works. for (int i = 0; i < context.getReads().size(); i++) { @@ -247,11 +258,13 @@ public class CallHLAWalker extends LocusWalker>{ int mapquality = read.getMappingQuality(); if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) { - //Adds read to list of reads for final phasing comparison String name = read.getReadName(); if (!AllReadNames.contains(name)){ AllReadNames.add(name); AllReads.add(read); + if( DEBUG){ + //out.print("\n" + read.getReadName() + "\t" + read.getCigarString() + "\t" + read.getReadLength() + "\n"); + } } if (base == 'A'){numAs++; depth++;} @@ -269,25 +282,43 @@ public class CallHLAWalker extends LocusWalker>{ if (depth > 0){ //Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype Scores = new Hashtable(); - for ( DiploidGenotype g : DiploidGenotype.values() ) { - Scores.put(g.toString(), G.getLikelihood(g)); - //also hash other combination not stored by DiploidGenotype - if (g.toString().equals("AC")) Scores.put("CA", G.getLikelihood(g)); - if (g.toString().equals("AG")) Scores.put("GA", G.getLikelihood(g)); - if (g.toString().equals("AT")) Scores.put("TA", G.getLikelihood(g)); - if (g.toString().equals("CG")) Scores.put("GC", G.getLikelihood(g)); - if (g.toString().equals("CT")) Scores.put("TC", G.getLikelihood(g)); - if (g.toString().equals("GT")) Scores.put("TG", G.getLikelihood(g)); + Double likelihood = 0.0; + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + likelihood = G.getLikelihood(g); + + Scores.put(g.toString(), likelihood); + //also hash other combination not stored by DiploidGenotype + if (g.toString().equals("AC")) { + Scores.put("CA", likelihood); + } else if (g.toString().equals("AG")){ + Scores.put("GA", likelihood); + } else if (g.toString().equals("AT")){ + Scores.put("TA", likelihood); + } else if (g.toString().equals("CG")){ + Scores.put("GC", likelihood); + } else if (g.toString().equals("CT")){ + Scores.put("TC", likelihood); + } else if (g.toString().equals("GT")){ + Scores.put("TG", likelihood); + } } //Get likelihood score for homozygous ref: used to normalize likelihoood scores at 0. String homref = String.valueOf(ref.getBase())+String.valueOf(ref.getBase()); Double homreflikelihood = Double.parseDouble((String) Scores.get(homref).toString()); + //Add SNP if it is a SNP and hasn't been added before + for ( DiploidGenotype g : DiploidGenotype.values() ) { + likelihood = G.getLikelihood(g); + if (likelihood > homreflikelihood && !SNPs.containsKey(Long.toString(loc))){ + SNPcount++; SNPs.put(Long.toString(loc),SNPcount); SNPlocations.add(Integer.valueOf(Long.toString(loc))); + + } + } //Update likelihood for each combinations of alleles - Double likelihood = 0.0; String r1 = "", r2 = "", s1 = "", s2 = ""; for (int j = 0; j < numHLAlleles; j++){ @@ -302,7 +333,7 @@ public class CallHLAWalker extends LocusWalker>{ if (j == j2) s1 = c1; if (j == k2) s2 = c1; - //Only check A-A, B-C, C-C combinations + //Only check HLA A-A, B-B, C-C combinations int kStart = 0, kStop = 0; if (j >= iAstart && j <= iAstop){ kStart = iAstart; kStop = iAstop; @@ -332,7 +363,9 @@ public class CallHLAWalker extends LocusWalker>{ if (Scores.containsKey(c1 + c2)){ //out.printf("j[%s],k[%s],g[%s],s[%.0f]\t",j,k,c1+c2,Scores.get(c1 + c2)); likelihood = Double.parseDouble((String) Scores.get(c1 + c2).toString()); - Prob[j][k]= Prob[j][k]+ likelihood - homreflikelihood; + ActualLikelihoods[j][k] = LOD[j][k]+ likelihood; + LOD[j][k]= ActualLikelihoods[j][k] - homreflikelihood; + } else{ if (DEBUG){ //out.printf("\nCharacters [%s] not found,j[%s],k[%s],%s,%s\n",c1+c2,j,k,HLAnames.get(j),HLAnames.get(k)); @@ -348,9 +381,9 @@ public class CallHLAWalker extends LocusWalker>{ if ( DEBUG ){ //Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes - out.printf("%s%s:%5.0f\t%s%s:%5.0f\t",r1,r2,Prob[j1][k1],s1,s2,Prob[j2][k2]); + out.printf("%s%s:%5.0f\t%s%s:%5.0f\t",r1,r2,LOD[j1][k1],s1,s2,LOD[j2][k2]); for ( DiploidGenotype g : DiploidGenotype.values() ) { - out.printf("%s %5.0f\t",g.toString(),likelihood - homreflikelihood); + out.printf("%s %5.0f\t",g.toString(),Scores.get(g.toString())); } out.printf("\n"); } @@ -360,6 +393,97 @@ public class CallHLAWalker extends LocusWalker>{ return context.getReads().size(); } + private int GetPhaseScore(int alleleindex){ + int score = 0; + ArrayList SNPsInRead = new ArrayList(); + ArrayList readindex = new ArrayList(); + Hashtable indexer = new Hashtable(); + indexer.put('A', (Integer) 0); + indexer.put('C', (Integer) 1); + indexer.put('G', (Integer) 2); + indexer.put('T', (Integer) 3); + indexer.put('D', (Integer) 4); // D for deletion + + //Get HLA allele sequence and position given index + String allele = HLAreads.get(alleleindex); + int allelestart = HLAstartpos[alleleindex], allelestop = HLAstoppos[alleleindex]; + + //Finds SNPs in allele + for (int i = allelestart; i <= allelestop; i++){ + if (SNPs.containsKey(String.valueOf(i))){ + //Stores matrix index + SNPsInRead.add((Integer) SNPs.get(String.valueOf(i))); + //stores position along read + readindex.add((Integer) i - allelestart); + } + } + + //sum score for every pair of SNPs in the allele + for (int i = 0; i < SNPsInRead.size(); i++){ + for (int j = i+1; j < SNPsInRead.size(); j ++){ + char c1 = allele.charAt((int) readindex.get(i)); + char c2 = allele.charAt((int) readindex.get(j)); + if (indexer.get(c1) != null && indexer.get(c2) != null){ + int a = (SNPsInRead.get(i)-1)*5 + (Integer) indexer.get(c1); + int b = (SNPsInRead.get(j)-1)*5 + (Integer) indexer.get(c2); + score += SNPcorrelation[a][b]; + } + } + } + return score; + } + + private void UpdateCorrelation(SAMRecord read, boolean PRINT){ + //Updates correlation table with SNPs from specific read (for phasing) + String s = CigarFormatted(read.getCigarString(), read.getReadString()); + ArrayList SNPsInRead = new ArrayList(); + ArrayList readindex = new ArrayList(); + Hashtable indexer = new Hashtable(); + + indexer.put('A', (Integer) 0); + indexer.put('C', (Integer) 1); + indexer.put('G', (Integer) 2); + indexer.put('T', (Integer) 3); + indexer.put('D', (Integer) 4); // D for deletion + + + int readstart = read.getAlignmentStart(); + + //Find all SNPs in read + for (int i = read.getAlignmentStart(); i <= read.getAlignmentEnd(); i++){ + if (SNPs.containsKey(String.valueOf(i))){ + //Stores matrix index + SNPsInRead.add((Integer) SNPs.get(String.valueOf(i))); + + //stores position along read + readindex.add((Integer) i - readstart); + } + } + + //Update correlation table; for each combination of SNP positions + for (int i = 0; i < SNPsInRead.size(); i++){ + for (int j = i+1; j < SNPsInRead.size(); j ++){ + char c1 = s.charAt((int) readindex.get(i)); + char c2 = s.charAt((int) readindex.get(j)); + + if (indexer.get(c1) != null && indexer.get(c2) != null){ + + int a = (SNPsInRead.get(i)-1)*5 + (Integer) indexer.get(c1); + int b = (SNPsInRead.get(j)-1)*5 + (Integer) indexer.get(c2); + if (PRINT){ + out.printf("ReadIndex[%s,%s] of %s\t", readindex.get(i),readindex.get(j),read.getAlignmentEnd()-readstart); + out.printf("SNP#[%s,%s] of %s\tPOS:%s[%s]\t%s[%s]\t", SNPsInRead.get(i),SNPsInRead.get(j), SNPs.size(), readindex.get(i)+readstart,c1, readindex.get(j)+readstart,c2); + out.printf("MatrixIndex[%s,%s] of %s",a,b,SNPcorrelation.length); + out.printf("\tc1=%s,c2=%s",c1,c2); + out.printf("\ta=%s,b=%s",a,b); + out.printf("\tSNPcorrelation[a][b]=%s\n",SNPcorrelation[a][b]); + } + SNPcorrelation[a][b]+=1; + } + } + } + } + public boolean isReduceByInterval() { return true; } @@ -371,7 +495,6 @@ public class CallHLAWalker extends LocusWalker>{ } public void onTraversalDone(Pair result) { - //Print HLA allele combinations with highest likelihood sums if (!PrintedOutput){ ArrayList TopAlleles = new ArrayList(); @@ -392,46 +515,71 @@ public class CallHLAWalker extends LocusWalker>{ for (int j = i; j < numHLAlleles; j++){ //Print likelihoods for all alleles if (!DEBUG){ - out.printf("%s\t%s\t%5.0f\n",HLAnames.get(i),HLAnames.get(j),Prob[i][j]); + out.printf("%s\t%s\t%5.0f\n",HLAnames.get(i),HLAnames.get(j),LOD[i][j]); } if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ - if (Prob[i][j] > maxA){ + if (LOD[i][j] > maxA){ maxA2 = maxA; i_maxA_2 = i_maxA; j_maxA_2 = j_maxA; - maxA = Prob[i][j]; i_maxA = i; j_maxA = j; + maxA = LOD[i][j]; i_maxA = i; j_maxA = j; } } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){ - if (Prob[i][j] > maxB){ + if (LOD[i][j] > maxB){ maxB2 = maxB; i_maxB_2 = i_maxB; j_maxB_2 = j_maxB; - maxB = Prob[i][j]; i_maxB = i; j_maxB = j; + maxB = LOD[i][j]; i_maxB = i; j_maxB = j; } } else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){ - if (Prob[i][j] > maxC){ + if (LOD[i][j] > maxC){ maxC2 = maxC; i_maxC_2 = i_maxC; j_maxC_2 = j_maxC; - maxC = Prob[i][j]; i_maxC = i; j_maxC = j; + maxC = LOD[i][j]; i_maxC = i; j_maxC = j; } } } } - - //Print alleles with the highest likelihoods + /* + maxB = 12912; for (int i = 0; i < numHLAlleles; i++){ for (int j = i; j < numHLAlleles; j++){ + + if (HLAnames.get(i).equals("HLA_B*1501") && HLAnames.get(j).equals("HLA_B*1801")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1501") && HLAnames.get(j).equals("HLA_B*1817N")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1505") && HLAnames.get(j).equals("HLA_B*1819")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1515") && HLAnames.get(j).equals("HLA_B*1812")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1801") && HLAnames.get(j).equals("HLA_B*9546")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1801") && HLAnames.get(j).equals("HLA_B*9502")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1801") && HLAnames.get(j).equals("HLA_B*9504")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1801") && HLAnames.get(j).equals("HLA_B*9540")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1801") && HLAnames.get(j).equals("HLA_B*1579N")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1817N") && HLAnames.get(j).equals("HLA_B*9546")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1817N") && HLAnames.get(j).equals("HLA_B*9502")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1817N") && HLAnames.get(j).equals("HLA_B*9504")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1817N") && HLAnames.get(j).equals("HLA_B*9540")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1817N") && HLAnames.get(j).equals("HLA_B*1579N")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1825") && HLAnames.get(j).equals("HLA_B*1571")){LOD[i][j] = 12912;} + if (HLAnames.get(i).equals("HLA_B*1538") && HLAnames.get(j).equals("HLA_B*1811")){LOD[i][j] = 12912;} + + } + } + */ + + //Print alleles with the highest likelihoods + for (Integer i = 0; i < numHLAlleles; i++){ + for (Integer j = i; j < numHLAlleles; j++){ if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ - if (Prob[i][j] == maxA && maxA > 0){ + if (LOD[i][j] == maxA && maxA > 0){ out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxA,maxA-maxA2); if (!TopAlleles.contains(i)){TopAlleles.add(i);} if (!TopAlleles.contains(j)){TopAlleles.add(j);} } } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){ - if (Prob[i][j] == maxB && maxB > 0){ + if (LOD[i][j] == maxB && maxB > 0){ out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxB,maxB-maxB2); if (!TopAlleles.contains(i)){TopAlleles.add(i);} if (!TopAlleles.contains(j)){TopAlleles.add(j);} } } else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){ - if (Prob[i][j] == maxC && maxC > 0){ + if (LOD[i][j] == maxC && maxC > 0){ out.printf("%s\t%s\t%5.0f\t%5.0f\tBEST\n",HLAnames.get(i),HLAnames.get(j),maxC,maxC-maxC2); if (!TopAlleles.contains(i)){TopAlleles.add(i);} if (!TopAlleles.contains(j)){TopAlleles.add(j);} @@ -441,20 +589,70 @@ public class CallHLAWalker extends LocusWalker>{ } } + out.printf("\nSNP count: %s, Number of SNPs %s\n",SNPcount,SNPs.size()); + SNPcorrelation = new int[SNPs.size()*5][SNPs.size()*5]; + + + + //Create correlation matrix and update correlation scores for all reads + for (int i = 0; i < AllReads.size(); i++){ + if (i == 2045){ + UpdateCorrelation(AllReads.get(i), false); + }else{ + UpdateCorrelation(AllReads.get(i), false); + } + //out.printf("[%s,%s]\n", ((Integer) SNPs.get("31431982")) * 4,((Integer) SNPs.get("31432003")) * 4 ); + //out.printf("%s\t[%s,%s]\t31431982[A] 31432003[A]\t%s\n",i,((Integer) SNPs.get("31431982")) * 4,((Integer) SNPs.get("31432003")) * 4 , SNPcorrelation[((Integer) SNPs.get("31431982")) * 4][((Integer) SNPs.get("31432003")) * 4]); + } + + Hashtable indexer = new Hashtable(); + indexer.put('A', (Integer) 0); + indexer.put('C', (Integer) 1); + indexer.put('G', (Integer) 2); + indexer.put('T', (Integer) 3); + indexer.put('D', (Integer) 4); // D for deletion + char[] bases = {'A','C','G','T','D'}; + + if ( false ){ + //prints entries in the correlation matrix that are > 0 + out.print("\n"); + for (int i = 0; i < SNPs.size(); i++){ + int loc1 = SNPlocations.get(i); + for (int j = i ; j < SNPs.size(); j++){ + int loc2 = SNPlocations.get(j); + for (char c1 : bases){ + for (char c2 : bases){ + int a = i*5 + (Integer) indexer.get(c1); + int b = j*5 + (Integer) indexer.get(c2); + if (SNPcorrelation[a][b] > 0){ + out.printf("[i,j]=[%s,%s]\t[a,b]=[%s,%s]\tPOS:%s[%s],%s[%s]\tCorr=%s\n",i,j,a,b,loc1,c1,loc2,c2,SNPcorrelation[a][b]); + } + } + } + + } + } + } + int i, j, k, readstart, readstop, allelestart, allelestop, pos_k; + + out.printf("Number of top alleles: %s\n",TopAlleles.size()); //Check top alleles for phasing - for (int i = 0; i< TopAlleles.size(); i++){ + + for (i = 0; i < TopAlleles.size(); i++){ + //String allele = HLAreads.get(TopAlleles.get(i)); + int phasescore = GetPhaseScore(TopAlleles.get(i)); if ( DEBUG ){ - //Debugging: print list of alleles to be checked for phasing - out.printf("%s\t%s",TopAlleles.get(i),HLAnames.get(TopAlleles.get(i))); - out.printf("\n"); + //Debugging: print list of alleles to be checked for phasing + out.printf("index=%s\t%s\tscore=%s\n",TopAlleles.get(i),HLAnames.get(TopAlleles.get(i)),phasescore); + } } //2nd Highest likelihoods - for (int i = 0; i < numHLAlleles; i++){ - for (int j = i; j < numHLAlleles; j++){ - if (Prob[i][j] == maxA2){ + for (i = 0; i < numHLAlleles; i++){ + for (j = i; j < numHLAlleles; j++){ + if (LOD[i][j] == maxA2){ //out.printf("2nd Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",maxA2,HLAnames.get(i),HLAnames.get(j),i,j); } }