Compares each read to the HLA dictionary and outputs closest allele, as well as other stats

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1693 348d0f76-0448-11de-a6fe-93d51630548a
2009-09-23 16:17:23 +00:00 · 2009-09-23 16:17:23 +00:00 · aa66074a0e
parent 3928751bf5
commit aa66074a0e
1 changed files with 256 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java
@ -0,0 +1,256 @@
+/*
+ * FindClosestAlleleWalker finds the most similar HLA allele per read
+ */
+
+package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.walkers.*;
+
+import java.io.FileInputStream;
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+import java.lang.Math;
+/**
+ *
+ * @author shermanjia
+ */
+@Requires({DataSource.READS, DataSource.REFERENCE})
+public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
+    String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.4digitUnique.sam";
+    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
+    
+    boolean DatabaseLoaded = false;
+    boolean DEBUG = true;
+
+    ArrayList<String> HLAreads = new ArrayList<String>();
+    ArrayList<String> HLAcigars = new ArrayList<String>();
+    ArrayList<String> HLAnames = new ArrayList<String>();
+    ArrayList<String> HLApositions = new ArrayList<String>();
+    double[] SingleAlleleFrequencies;
+
+    int numHLAlleles = 0;
+    int[] HLAstartpos;
+    int[] HLAstoppos;
+
+    Hashtable AlleleFrequencies = new Hashtable();
+    int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
+
+
+    public Integer reduceInit() { 
+    if (!DatabaseLoaded){
+            try{
+                out.printf("Reading HLA database ...");
+                FileInputStream fstream = new FileInputStream(HLAdatabaseFile);
+                DataInputStream in = new DataInputStream(fstream);
+                BufferedReader br = new BufferedReader(new InputStreamReader(in));
+                String strLine; String [] s = null;
+                //Read File Line By Line
+                int i = 0;
+                while ((strLine = br.readLine()) != null)   {
+                    s = strLine.split("\\t");
+                    if (s.length>=10){
+                        //Parse the reads with cigar parser
+                        HLAreads.add(CigarFormatted(s[5],s[9]));
+                        HLAcigars.add(s[5]);
+                        HLAnames.add(s[0]);
+                        HLApositions.add(s[3]);
+                        if (s[0].indexOf("HLA_A") > -1){
+                            if (iAstart < 0){iAstart=i;}
+                            iAstop = i; i++;
+                        }else if (s[0].indexOf("HLA_B") > -1){
+                            if (iBstart < 0){iBstart=i;}
+                            iBstop = i; i++;
+                        }else if (s[0].indexOf("HLA_C") > -1){
+                            if (iCstart < 0){iCstart=i;}
+                            iCstop = i; i++;
+                        }
+                    }
+                }
+                in.close();
+                int n = HLApositions.size(); numHLAlleles = n;
+                HLAstartpos = new int[n]; HLAstoppos = new int[n];
+                SingleAlleleFrequencies = new double[n];
+
+
+                for (i = 0; i < n; i++){
+                    //Find start and stop positions for each allele
+                    HLAstartpos[i]=Integer.parseInt(HLApositions.get(i));
+                    HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1;
+                    SingleAlleleFrequencies[i]=0.0;
+                    //Initialize matrix of probabilities / likelihoods
+
+                }
+                out.printf("DONE! Read %s alleles\n",HLAreads.size());
+            }catch (Exception e){//Catch exception if any
+              System.err.println("Error: " + e.getMessage());
+            }
+
+            try{
+                out.printf("Reading allele frequences ...");
+                FileInputStream fstream = new FileInputStream(CaucasianAlleleFrequencyFile);
+                DataInputStream in = new DataInputStream(fstream);
+                BufferedReader br = new BufferedReader(new InputStreamReader(in));
+                String strLine; String [] s = null;
+                //Read File Line By Line
+                int count = 0;
+                while ((strLine = br.readLine()) != null)   {
+                    s = strLine.split("\\t");
+                    AlleleFrequencies.put(s[0], s[1]);
+                    count++;
+                }
+                in.close();
+                out.printf("Done! Read %s alleles\n",count);
+            }catch (Exception e){//Catch exception if any
+              System.err.println("Error: " + e.getMessage());
+            }
+
+            DatabaseLoaded = true;
+            out.printf("Comparing reads to database ...\n");
+
+            if (DEBUG){
+                //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
+            }
+        }
+        return 0;
+    }
+
+    private String CigarFormatted(String cigar, String read){
+        // returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur
+        String formattedRead = ""; char c; String count;
+        int cigarPlaceholder = 0; int subcigarLength = 0;
+        int readPlaceholder = 0; int subreadLength = 0;
+
+        //reads cigar string
+        for (int i = 0; i < cigar.length(); i++){
+            c = cigar.charAt(i);
+            if (c == 'M'){
+                //If reach M for match/mismatch, get number immediately preceeding 'M' and tack on that many characters to sequence
+                subcigarLength = i-cigarPlaceholder;
+                count = cigar.substring(cigarPlaceholder, i);
+
+                subreadLength = Integer.parseInt(count);
+                formattedRead = formattedRead + read.substring(readPlaceholder, readPlaceholder+subreadLength);
+
+                //increment placeholders
+                cigarPlaceholder = i+1;
+                readPlaceholder = readPlaceholder + subreadLength;
+            } else if (c == 'I'){
+                //***NOTE: To be modified later if needed (insertions removed here)***
+
+                //If reaches I for insertion, get number before 'I' and skip that many characters in sequence
+                count = cigar.substring(cigarPlaceholder, i);
+                subreadLength = Integer.parseInt(count);
+
+                //increment placeholders without adding inserted bases to sequence (effectively removes insertion).
+                cigarPlaceholder = i+1;
+                readPlaceholder = readPlaceholder + subreadLength;
+            } else if (c == 'H' || c == 'S'){
+                //(H = Headers or S = Soft clipped removed here)***
+
+                //If reaches H for insertion, get number before 'H' and skip that many characters in sequence
+                count = cigar.substring(cigarPlaceholder, i);
+                subreadLength = Integer.parseInt(count);
+
+                //increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion).
+                cigarPlaceholder = i+1;
+            } else if (c == 'D'){
+                //If reaches D for deletion, insert 'D' into sequence as placeholder
+                count = cigar.substring(cigarPlaceholder, i);
+                subreadLength = Integer.parseInt(count);
+
+                //Add one 'D' for each deleted base
+                String deletion = "";
+                for (int j = 1; j <= subreadLength; j++){
+                    deletion = deletion + "D";
+                }
+
+                //update placeholders
+                formattedRead = formattedRead + deletion;
+                cigarPlaceholder = i+1;
+            }
+
+        }
+        return formattedRead;
+    }
+
+
+    public Integer map(char[] ref, SAMRecord read) {
+        int readstart = read.getAlignmentStart();
+        int readstop = read.getAlignmentEnd();
+        double[] nummatched = new double[HLAreads.size()];
+        double[] concordance = new double[HLAreads.size()];
+        double[] numcompared = new double[HLAreads.size()];
+        double maxConcordance = 0;
+        String s1 = CigarFormatted(read.getCigarString(), read.getReadString());
+        char c1, c2;
+        String s2 = "", name = "";
+
+        //For every allele that overlaps with current read
+        for (int i = 0; i < HLAreads.size(); i++){
+            nummatched[i] = 0;
+            //Get concordance between read and specific allele
+            if (DEBUG){
+                //out.printf("%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames.get(i),readstart,readstop,HLAstartpos[i],HLAstoppos[i]);
+            }
+            if (readstart <= HLAstoppos[i] && readstop >= HLAstartpos[i]){
+                s2 = HLAreads.get(i);
+                for (int j = read.getAlignmentStart(); j <= read.getAlignmentEnd(); j++){
+                    c1 = s1.charAt(j-readstart);
+                    c2 = s2.charAt(j-HLAstartpos[i]);
+                    if (DEBUG){
+                        //out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames.get(i),j,j-readstart,j-HLAstartpos[i],c1,c2);
+                    }
+                    if (c1 != 'D'){
+                        numcompared[i]++;
+                        if (c1 == c2){
+                            nummatched[i]++;
+                        }
+                    }
+                }
+            }
+            concordance[i]=nummatched[i]/numcompared[i];
+            if (concordance[i] > maxConcordance){maxConcordance = concordance[i];}
+        }
+        double freq, maxFreq = 0.0;
+        for (int i = 0; i < HLAreads.size(); i++){
+            if (concordance[i] == maxConcordance && maxConcordance > 0){
+                name=HLAnames.get(i).substring(4);
+                if (AlleleFrequencies.containsKey(name)){
+                    freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString());
+                }else{
+                    freq=0.0001;
+                }
+                if (freq > maxFreq){maxFreq = freq;}
+            }
+        }
+
+        out.printf("%s", read.getReadName());
+        for (int i = 0; i < HLAreads.size(); i++){
+            if (concordance[i] == maxConcordance && maxConcordance > 0){
+                name=HLAnames.get(i).substring(4);
+                if (AlleleFrequencies.containsKey(name)){
+                    freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString());
+                }else{
+                    freq=0.0001;
+                }
+                if (freq == maxFreq){
+                    out.printf("\t%s\t%.3f\t%.0f\t%.0f\t%.3f",HLAnames.get(i),concordance[i],numcompared[i],numcompared[i]-nummatched[i],freq);
+                }
+            }
+        }
+        out.print("\n");
+        return 1;
+    }
+
+
+    public Integer reduce(Integer value, Integer sum) {
+        return value + sum;
+    }
+}
+