HLA caller updated - now searches all alleles (common and rare), with more efficient read filtering and allele-comparison runs.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3543 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2010-06-14 15:14:40 +00:00
parent d51e6c45a7
commit 5704294f9d
9 changed files with 1122 additions and 184 deletions

View File

@ -33,7 +33,9 @@ import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Enumeration;
import java.util.Vector;
import java.util.Collections;
/**
* Calculates likelihood of observing the data given pairs of HLA alleles. NOTE: run CalculateBaseLikelihoods first! Usage: java -jar GenomeAnalysisTK.jar -T CalculateAlleleLikelihoods -I /humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods -eth\
@ -60,17 +62,20 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
Hashtable AlleleFrequencies,UniqueAlleles;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles4Digit";
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.txt";
String HLA2DigitFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY_2DIGIT.txt";
Hashtable AlleleFrequencies,UniqueAlleles,Alleles2Digit;
CigarParser formatter = new CigarParser();
double[][] baseLikelihoods;
int[] positions;
boolean loaded = false;
String[] HLAnames, HLAreads;
Integer[] HLAstartpos, HLAstoppos;
ArrayList<String> HLAnamesAL, HLAreadsAL;
String[] HLAnames, HLAreads, HLAnames2, HLAreads2;
Integer[] HLAstartpos, HLAstoppos, HLAstartpos2, HLAstoppos2;
ArrayList<String> HLAnamesAL, HLAreadsAL, Loci, AllelesToSearch;
ArrayList<Integer> HLAstartposAL, HLAstopposAL;
public Integer reduceInit() {
@ -86,21 +91,27 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
HLAstartposAL = new ArrayList<Integer>();
HLAstopposAL = new ArrayList<Integer>();
if (!ethnicity.equals("CaucasianUSA")){
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
out.printf("INFO Reading HLA alleles ... ");
HLAFileReader HLADictionaryReader = new HLAFileReader();
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetSequences();
HLAnames = HLADictionaryReader.GetNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
HLADictionaryReader = new HLAFileReader();
HLADictionaryReader.ReadFile(HLA2DigitFile);
HLAreads2 = HLADictionaryReader.GetSequences();
HLAnames2 = HLADictionaryReader.GetNames();
HLAstartpos2 = HLADictionaryReader.GetStartPositions();
HLAstoppos2 = HLADictionaryReader.GetStopPositions();
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
//out.printf("INFO Common alleles:\n");
for (int i = 1; i < UniqueAlleles.size(); i++){
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
}
out.printf("INFO Reading HLA dictionary ...");
//out.printf("INFO Reading HLA dictionary ...");
}
@ -108,10 +119,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
}
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
HLAnamesAL.add(read.getReadName());
HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
HLAstartposAL.add(read.getAlignmentStart());
HLAstopposAL.add(read.getAlignmentEnd());
//HLAnamesAL.add(read.getReadName());
//HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
//HLAstartposAL.add(read.getAlignmentStart());
//HLAstopposAL.add(read.getAlignmentEnd());
//out.printf("INFO\t%s\t%s\t%s\t%s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),formatter.FormatRead(read.getCigarString(), read.getReadString()));
return 1;
}
@ -154,11 +166,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
}
public void onTraversalDone(Integer numreads) {
out.printf("Done! %s alleles found\n", numreads);
HLAnames = HLAnamesAL.toArray(new String[numreads]);
HLAreads = HLAreadsAL.toArray(new String[numreads]);
HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
//out.printf("Done! %s alleles found\n", numreads);
//HLAnames = HLAnamesAL.toArray(new String[numreads]);
//HLAreads = HLAreadsAL.toArray(new String[numreads]);
//HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
//HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
double[][] AlleleLikelihoods = new double[numreads][numreads];
@ -173,6 +185,7 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
int numcombinations = 0;
out.printf("NUM\tAllele1\tAllele2\tSSG\n");
//debugging specific alleles
int index1 = -1, index2 = -1;
if (!debugAlleles.equals("")){
String s[] = debugAlleles.split(",");
@ -185,52 +198,96 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
}
if (index1 > -1 && index2 > -1){
out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
double dl = CalculateLikelihood(index1,index2,true);
double dl = CalculateLikelihood(index1,index2,HLAreads2,true);
break;
}
}
}
//Pre-process homozygous combinations to determine top possible alleles (for efficiency)
int numreads2 = HLAnames2.length;
Alleles2Digit = new Hashtable();
Loci = new ArrayList<String>();
double[] AlleleLikelihoods2 = new double[numreads];
for (int i = 0; i < numreads; i++){
name1 = HLAnames[i].substring(4);
String [] n1 = name1.split("\\*");
// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3));
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
//out.printf("1: %s\n",name1);
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
//if (frq1 > minfrq){
numcombinations++;
AlleleLikelihoods2[i] = CalculateLikelihood(i,i,HLAreads,false);
if (AlleleLikelihoods2[i] < 0){
name2 = n1[0] + "*" + n1[1].substring(0, 2);
if (!Loci.contains(n1[0])){Loci.add(n1[0]);}
if (!Alleles2Digit.containsKey(name2)){
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
}else if ((Double) Alleles2Digit.get(name2) < AlleleLikelihoods2[i]){
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
}
}
}
//Sort alleles at 2 digit resolution for each locus
AllelesToSearch = new ArrayList<String>();
for (int i = 0; i < Loci.size(); i++){
Enumeration k = Alleles2Digit.keys();
Hashtable AllelesAtLoci = new Hashtable();
//find alleles at the locus
while( k.hasMoreElements() ){
name1 = k.nextElement().toString();
String [] n1 = name1.split("\\*");
if (Loci.get(i).equals(n1[0])){
AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
}
}
//Sort alleles at locus, mark top ten 2-digit classes for deep search
int num = 1;
Vector v = new Vector(AllelesAtLoci.keySet());
Collections.sort(v);
for (Enumeration e = v.elements(); e.hasMoreElements();) {
Double key = Double.valueOf(e.nextElement().toString());
String allele = AllelesAtLoci.get(key).toString();
if (num <= 10){
AllelesToSearch.add(allele);
num++;
}
//out.printf("%s\t%s\n",allele,key);
}
}
//Iterate through allele pairs to calculate likelihoods
if (true){
numcombinations = 0;
for (int i = 0; i < numreads; i++){
name1 = HLAnames[i].substring(4);
String [] n1 = name1.split("\\*");
if (AllelesToSearch.contains(n1[0] + "*" + n1[1].substring(0, 2))){
//out.printf("1: %s\n",name1);
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
//if (frq1 > minfrq){
for (int j = i; j < numreads; j++){
name2 = HLAnames[j].substring(4);
String [] n2 = name2.split("\\*");
// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3));
if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
// if (frq2 > minfrq){
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
numcombinations++;
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,false);
if (AllelesToSearch.contains(n2[0] + "*" + n2[1].substring(0, 2))){
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
numcombinations++;
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,HLAreads,false);
if (AlleleLikelihoods[i][j] < 0){
out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]);
}
// }else{
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name2,frq2);}
// }
// }else{
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name2);}
}
}
}
//}else{
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
//}
//}else{
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
}
}
}
}
private double CalculateLikelihood(int a1, int a2, boolean debug){
String read1 = HLAreads[a1];
String read2 = HLAreads[a2];
private double CalculateLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
//Calculates likelihood for specific allele pair
String read1 = HLAalleles[a1];
String read2 = HLAalleles[a2];
int start1 = HLAstartpos[a1];
int start2 = HLAstartpos[a2];
int stop1 = HLAstoppos[a1];

View File

@ -72,34 +72,13 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
ArrayList<String> AllReadNames = new ArrayList<String>();
boolean HLAdataLoaded = false;
boolean dataLoaded = false;
//Loads HLA dictionary, allele frequencies, and reads to filter
//Loads reads to filter
public Pair<Long, Long> reduceInit() {
if (!HLAdataLoaded){
HLAdataLoaded = true;
out.printf("INFO Reading HLA database ... ");
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
InitializeVariables(HLAreads.length);
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
if (!ethnicity.equals("CaucasianUSA")){
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
if (!dataLoaded){
dataLoaded = true;
if (!filterFile.equals("")){
out.printf("INFO Reading properties file ... ");
@ -137,7 +116,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
//if (DEBUG){
out.printf("%s\t%s\t", context.getLocation(),ref.getBase());
out.printf("%s\t%s\t", context.getLocation(),(char)ref.getBase());
//}
//Calculate posterior probabilities

View File

@ -53,24 +53,25 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "Caucasian";
@Argument(fullName = "useInterval", shortName = "useInterval", doc = "Use only these intervals", required = false)
public String intervalFile = "";
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
SAMFileReader HLADictionaryReader = new SAMFileReader();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
HLAFileReader HLADictionaryReader = new HLAFileReader();
boolean DatabaseLoaded = false;
boolean DEBUG = false;
ArrayList<String> ClosestAlleles = new ArrayList<String>();
String[] HLAnames, HLAreads;
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
@ -88,6 +89,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
Hashtable AlleleFrequencies = new Hashtable();
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
CigarParser formatter = new CigarParser();
int [][] intervals; int numIntervals;
public Integer reduceInit() {
if (!DatabaseLoaded){
@ -96,9 +98,9 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
//Load HLA dictionary
out.printf("INFO Loading HLA dictionary ... ");
HLADictionaryReader.ReadFile(HLAdictionaryFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetSequences();
HLAnames = HLADictionaryReader.GetNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
minstartpos = HLADictionaryReader.GetMinStartPos();
@ -110,26 +112,27 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
concordance = new double[HLAreads.length];
numcompared = new double[HLAreads.length];
//Read allele frequencies
if (ethnicity.equals("Black")){
AlleleFrequencyFile = BlackAlleleFrequencyFile;
}else{
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
//FindPolymorphicSites(minstartpos,maxstoppos);
//Load list of polymorphic sites
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
siteFileReader.ReadFile(PolymorphicSitesFile);
PolymorphicSites = siteFileReader.GetPolymorphicSites();
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
numpolymorphicsites = PolymorphicSites.length;
numnonpolymorphicsites = NonPolymorphicSites.length;
if (!intervalFile.equals("")){
TextFileReader fileReader = new TextFileReader();
fileReader.ReadFile(intervalFile);
String[] lines = fileReader.GetLines();
intervals = new int[lines.length][2];
for (int i = 0; i < lines.length; i++) {
String[] s = lines[i].split(":");
String[] intervalPieces = s[1].split("-");
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
}
numIntervals = intervals.length;
}
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
out.printf("INFO Comparing reads to database ...\n");
@ -167,7 +170,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
for (int j = 0; j < numpolymorphicsites; j++){
pos = PolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
@ -176,7 +179,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
nummatched[i]++;
}else{
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, pos,c1,c2);
}
}
}
@ -187,13 +190,13 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
if (numcompared[i] > 0){
for (int j = 0; j < numnonpolymorphicsites; j++){
pos = NonPolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
}
}
}
@ -205,7 +208,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
concordance[i]=nummatched[i]/numcompared[i];
if (concordance[i] > maxConcordance){maxConcordance = concordance[i];}
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
}
if (findFirst && (concordance[i] == 1)){
break;
@ -228,32 +231,42 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
return maxFreq;
}
/**
 * Tests whether a genomic position falls inside one of the intervals loaded
 * from the -useInterval file.
 *
 * Fix: when no interval file was supplied, numIntervals is 0 (and intervals is
 * null), and the original implementation returned false for every position —
 * which made the callers skip ALL polymorphic-site comparisons. With no
 * intervals loaded, no positional restriction should apply, so return true.
 *
 * @param pos genomic position to test (same coordinate system as the interval file)
 * @return true if pos lies within any loaded interval, or if no intervals were loaded
 */
private boolean IsWithinInterval(int pos){
    //No -useInterval file given: do not restrict positions.
    if (numIntervals == 0){
        return true;
    }
    for (int i = 0; i < numIntervals; i++){
        if (pos >= intervals[i][0] && pos <= intervals[i][1]){
            return true;
        }
    }
    return false;
}
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
//Calculate concordance for this read and all overlapping reads
if (read.getMappingQuality() > 0){
double maxConcordance = CalculateConcordance(read);
String stats = "", topAlleles = "";
if (maxConcordance > 0){
String readname = read.getReadName(), allelename = ""; double freq;
//For input bam files that contain HLA alleles, find and print allele frequency
//freq = GetAlleleFrequency(readname);
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
//Find the maximum frequency of the alleles most concordant with the read
//double maxFreq = FindMaxAlleleFrequency(maxConcordance);
//Print concordance statistics between this read and the most similar HLA allele(s)
for (int i = 0; i < HLAreads.length; i++){
if (concordance[i] == maxConcordance){
freq = GetAlleleFrequency(HLAnames[i]);
//if (freq == maxFreq){
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
//}
break;
//freq = GetAlleleFrequency(HLAnames[i]);
if (topAlleles.equals("")){
topAlleles = HLAnames[i];
}else{
topAlleles = topAlleles + "," + HLAnames[i];
}
stats = String.format("%.1f\t%.3f\t%.0f\t%.0f",1.0,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
}
}
out.print("\n");
out.printf("\t%s\t%s\n",stats,topAlleles);
}
}
return 1;

View File

@ -29,18 +29,12 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
SAMFileReader HLADictionaryReader = new SAMFileReader();
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
//String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
HLAFileReader HLADictionaryReader = new HLAFileReader();
boolean DatabaseLoaded = false;
boolean DEBUG = false;
@ -69,8 +63,8 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
out.printf("INFO Loading HLA dictionary ... ");
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLAreads = HLADictionaryReader.GetSequences();
HLAnames = HLADictionaryReader.GetNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
minstartpos = HLADictionaryReader.GetMinStartPos();
@ -82,18 +76,6 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
concordance = new double[HLAreads.length];
numcompared = new double[HLAreads.length];
//Read allele frequencies
if (ethnicity.equals("Black")){
AlleleFrequencyFile = BlackAlleleFrequencyFile;
}else{
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
FindPolymorphicSites(minstartpos,maxstoppos);
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
@ -108,13 +90,15 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
private void FindPolymorphicSites(int start, int stop){
boolean initialized, polymorphic, examined;
char c = ' ';
char c = ' ', ch = ' ';
int A = 0, C = 0, G = 0, T = 0;
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
//Find polymorphic sites in dictionary
for (int pos = start; pos <= stop; pos++){
initialized = false; polymorphic = false; examined = false;
//look across all alleles at specific position to see if it is polymorphic
A = 0; C = 0; G = 0; T = 0;
for (int i = 0; i < HLAreads.length; i++){
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
if (!initialized){
@ -122,19 +106,28 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
initialized = true;
examined = true;
}
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
polymorphicsites.add(pos);
out.printf("POLYMORPHIC\t6\t%s\n", pos);
ch = HLAreads[i].charAt(pos-HLAstartpos[i]);
if (ch == 'A'){A++;}
else if (ch == 'C'){C++;}
else if (ch == 'T'){T++;}
else if (ch == 'G'){G++;}
if (ch != c){
// polymorphicsites.add(pos);
// out.printf("POLYMORPHIC\t6\t%s\n", pos);
polymorphic = true;
break;
// break;
}
}
}
if (polymorphic){
out.printf("%s\t%s\t%s\t%s\t%s\n",pos,A,C,G,T);
}
//if (!polymorphic && examined){
// nonpolymorphicsites.add(pos);
// out.printf("CONSERVED\t6\t%s\n", pos);
//}
}
if (!polymorphic && examined){
nonpolymorphicsites.add(pos);
out.printf("CONSERVED\t6\t%s\n", pos);
}
}
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);

View File

@ -7,41 +7,68 @@ import java.util.Hashtable;
* @author shermanjia
*/
public class FrequencyFileReader {
Hashtable AlleleFrequencies = new Hashtable();
Hashtable UniqueAlleles = new Hashtable();
Hashtable MaxFrequencies = new Hashtable();
Hashtable CommonAlleles = new Hashtable();
Hashtable [] AlleleFrequencies = null;
String [] Populations = null;
public Hashtable GetAlleleFrequencies(){
public Hashtable [] GetAlleleFrequencies(){
//return allele frequencies for all populations
return AlleleFrequencies;
}
public Hashtable GetUniqueAlleles(){
return UniqueAlleles;
public Hashtable GetCommonAlleles(){
//return list of common alleles
return CommonAlleles;
}
public void ReadFile(String filename, String uniqueAllelesFile){
public Hashtable GetMaxFrequencies(){
//return maximum frequency observed across populations for each allele
return MaxFrequencies;
}
public String[] GetPopulations(){
//Return name of populations
return Populations;
}
public void ReadFile(String filename, String ethnicity){
try{
int linenum = 0;
FileInputStream fstream = new FileInputStream(filename);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
//Read File Line By Line
while ((strLine = br.readLine()) != null) {
linenum++;
s = strLine.split("\\t");
AlleleFrequencies.put(s[0], s[1]);
//System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString());
}
in.close();
fstream = new FileInputStream(uniqueAllelesFile);
in = new DataInputStream(fstream);
br = new BufferedReader(new InputStreamReader(in));
//Read File Line By Line
while ((strLine = br.readLine()) != null) {
UniqueAlleles.put(strLine,strLine);
//System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString());
if (linenum == 1){
//Determine number of populations, create a hash table for each population
AlleleFrequencies = new Hashtable[s.length-1];
Populations = new String[s.length-1];
for (int i = 1; i < s.length; i++){
Populations[i-1]=s[i];
AlleleFrequencies[i-1] = new Hashtable();
}
}else{
//assign allele frequencies for each population
for (int i = 1; i < s.length; i++){
if (Double.valueOf(s[i]) > 0.0001){
CommonAlleles.put(s[0], s[0]);
}
AlleleFrequencies[i-1].put(s[0],s[i]);
if (!MaxFrequencies.containsKey(s[0])){
MaxFrequencies.put(s[0], s[i]);
}else if (Double.valueOf(MaxFrequencies.get(s[0]).toString()) < Double.valueOf(s[i])){
MaxFrequencies.put(s[0], s[i]);
}
}
}
}
in.close();
}catch (Exception e){//Catch exception if any
System.err.println("FrequencyFileReader Error: " + e.getMessage());
System.err.println("Exception in FrequencyFileReader (" + e.getMessage() + ").");
}
}
}

View File

@ -0,0 +1,737 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.commandline.Argument;
import java.util.*;
import java.util.Map.Entry;
/**
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods [-filter $ID.filter -minAllowedMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
* @author shermanjia
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class HLACallerWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
public String baseLikelihoodsFile = "";
// --- Command-line arguments ---
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
public boolean DEBUG = false;
// Output of the similarity pre-processing step; determines reads to discard
// and which alleles to search (see SimilarityFileReader usage in reduceInit).
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
public String filterFile = "";
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "CaucasiansUSA";
// Comma-separated pair of allele names to trace in detail (see onTraversalDone).
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
public String debugAlleles = "";
@Argument(fullName = "phaseInterval", shortName = "phaseInterval", doc = "Use only these intervals in phase calculation", required = false)
public String phaseIntervalFile = "";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
public int MINALLOWEDMISMATCHES = 7;
GATKArgumentCollection args = this.getToolkit().getArguments();
// --- Hard-coded resource locations (Broad cluster paths) ---
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
String PolymorphicSitesfile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
// --- State initialized once in reduceInit() ---
HLAFileReader HLADictionaryReader = new HLAFileReader();
boolean HLAdataLoaded = false;   // guards the one-time loading in reduceInit()
String[] HLAnames, HLAreads, Populations;
ArrayList<String> ReadsToDiscard;          // reads rejected by the similarity filter
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
// Pairwise SNP co-observation counts for phasing: numObservations is indexed
// by (siteIndex*5 + baseIndex) on both axes; totalObservations by site pair.
int[][] numObservations, totalObservations, intervals;
int[] SNPnumInRead, SNPposInRead, positions;   // scratch arrays reused per read / per allele pair
CigarParser cigarparser = new CigarParser();
Hashtable MaxLikelihoods = new Hashtable();    // best combined likelihood seen per locus
Hashtable MaxFrequencies, CommonAlleles, AlleleCount, LocusCount;
Hashtable[] AlleleFrequencies;                 // one allele-frequency table per population
int numIntervals;
double[][] baseLikelihoods;                    // per-position diploid genotype likelihoods
ArrayList AllelesToSearch = new ArrayList<String>();
// setting error rates for phasing algorithm (1% expected error rate for any genotype)
double P_err = 0.01;
double P_correct = 1 - P_err;
double L_err = Math.log10(P_err);
double L_correct = Math.log10(P_correct);
public Integer reduceInit() {
    // One-time setup (guarded by HLAdataLoaded): loads the HLA allele
    // dictionary, the similarity pre-filter output, the base likelihoods,
    // the population allele frequencies, and optional phasing intervals.
    if (!HLAdataLoaded){
        HLAdataLoaded = true;
        //Load HLA dictionary
        HLADictionaryReader.ReadFile(HLAdatabaseFile);
        HLAreads = HLADictionaryReader.GetSequences();
        HLAnames = HLADictionaryReader.GetNames();
        HLAstartpos = HLADictionaryReader.GetStartPositions();
        HLAstoppos = HLADictionaryReader.GetStopPositions();
        //Load pre-processing file for misaligned reads and list of alleles to search
        SimilarityFileReader similarityReader = new SimilarityFileReader();
        similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
        ReadsToDiscard = similarityReader.GetReadsToDiscard();
        AllelesToSearch = similarityReader.GetAllelesToSearch();
        AlleleCount = similarityReader.GetAlleleCount();
        LocusCount = similarityReader.GetLocusCount();
        for (int i = 0; i < AllelesToSearch.size(); i++){
            out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
        }
        //Load genotypes and find polymorphic sites (sites that differ from reference)
        BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
        baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
        baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods();
        positions = baseLikelihoodsReader.GetPositions();
        PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
        out.printf("INFO\t%s polymorphic sites found\n",PolymorphicSites.length);
        // Scratch arrays sized to the polymorphic-site count. The *5 factor
        // leaves 5 slots per site: indices for A/C/G/T plus one reserved
        // (presumably for deletions -- see the commented 'D' case in IndexOf).
        int l = PolymorphicSites.length;
        SNPnumInRead = new int[l];
        SNPposInRead = new int[l];
        numObservations = new int[l*5][l*5];
        totalObservations = new int[l][l];
        //Load allele frequencies for different populations
        FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
        HLAfreqReader.ReadFile(AlleleFrequencyFile,ethnicity);
        AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
        MaxFrequencies = HLAfreqReader.GetMaxFrequencies();
        CommonAlleles = HLAfreqReader.GetCommonAlleles();
        Populations = HLAfreqReader.GetPopulations();
        //Load genomic intervals for bam file (lines formatted "chr:start-stop")
        if (!phaseIntervalFile.equals("")){
            TextFileReader fileReader = new TextFileReader();
            fileReader.ReadFile(phaseIntervalFile);
            String[] lines = fileReader.GetLines();
            intervals = new int[lines.length][2];
            for (int i = 0; i < lines.length; i++) {
                String[] s = lines[i].split(":");
                String[] intervalPieces = s[1].split("-");
                intervals[i][0] = Integer.valueOf(intervalPieces[0]);
                intervals[i][1] = Integer.valueOf(intervalPieces[1]);
            }
            numIntervals = intervals.length;
        }
    }
    return 0;
}
/**
 * Per-read traversal step: reads rejected by the similarity pre-filter
 * contribute nothing; every other read updates the pairwise SNP phasing
 * counts via UpdateCorrelation. Always reports one read processed.
 */
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
    boolean discarded = ReadsToDiscard.contains(read.getReadName());
    if (!discarded) {
        UpdateCorrelation(read);
    }
    return 1;
}
/**
 * Accumulates the running read tally: adds this read's count (from map())
 * to the sum carried so far.
 */
public Integer reduce(Integer value, Integer sum) {
    Integer total = sum + value;
    return total;
}
/**
 * Final pass after read traversal: (1) scores every candidate allele pair
 * (genotype likelihood + phase likelihood + log10 population frequencies),
 * (2) normalizes per locus into posterior probabilities, (3) aggregates the
 * full-resolution pairs at 4-digit resolution, and (4) prints ranked calls
 * plus per-population frequency columns.
 *
 * BUGFIX: the 4-digit aggregation previously combined each row's allele and
 * phase terms with the frequency terms (f1/f2) of the PREVIOUS row (0.0 on
 * the first row); the frequencies are now parsed before they are used.
 */
public void onTraversalDone(Integer numreads) {
    String name1, name2, d4_name1, d4_name2;
    Double frq1 = 0.0, frq2 = 0.0, log1 = 0.0, log2 = 0.0, alleleLikelihood = 0.0, phaseLikelihood = 0.0, minfrq = 0.0, likelihood = 0.0;
    int numCombinations = 0;

    // For debugging specific alleles: print detailed traces for one pair.
    if (!debugAlleles.equals("")){
        String s[] = debugAlleles.split(",");
        int index1 = HLADictionaryReader.GetIndex(s[0]);
        int index2 = HLADictionaryReader.GetIndex(s[1]);
        out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
        if (index1 > -1 && index2 > -1){
            alleleLikelihood = CalculateAlleleLikelihood(index1,index2,HLAreads,true);
            phaseLikelihood = CalculatePhaseLikelihood(index1,index2,true,false);
        }
    }

    if (ONLYFREQUENT){
        // NOTE(review): minfrq is assigned but never consulted below, so the
        // -onlyfrequent flag currently has no effect here. Confirm intent.
        minfrq = 0.0001;
    }

    ArrayList Output = new ArrayList<String>();
    ArrayList Likelihoods = new ArrayList<Double>();

    // Search pairs of alleles that satisfy the initial search criteria.
    // Allele 1:
    for (int i = 0; i < HLAnames.length; i++){
        name1 = HLAnames[i].substring(4);
        String [] n1 = name1.split("\\*");
        d4_name1 = n1[0] + "*" + n1[1].substring(0, 4);
        if (AllelesToSearch.contains(d4_name1)){
            if (MaxFrequencies.containsKey(d4_name1)){
                frq1 = Double.parseDouble(MaxFrequencies.get(d4_name1).toString());
            }else{
                // Alleles absent from the frequency table get a small pseudo
                // frequency; null-expression ("N") alleles get a smaller one.
                if (n1[1].length() > 4 && n1[1].substring(4, 5).equals("N")){
                    frq1 = .00000005;
                }else{
                    frq1 = .000001;
                }
            }
            // Allele 2 (j >= i: each unordered pair is evaluated once):
            for (int j = i; j < HLAnames.length; j++){
                name2 = HLAnames[j].substring(4);
                String [] n2 = name2.split("\\*");
                d4_name2 = n2[0] + "*" + n2[1].substring(0, 4);
                // Both alleles must be at the same locus and in the search set.
                if (n1[0].equals(n2[0]) && (AllelesToSearch.contains(d4_name2))){
                    if (MaxFrequencies.containsKey(d4_name2)){
                        frq2 = Double.parseDouble(MaxFrequencies.get(d4_name2).toString());
                    }else{
                        if (n2[1].length() > 4 && n2[1].substring(4, 5).equals("N")){
                            frq2 = .00000005;
                        }else{
                            frq2 = .000001;
                        }
                    }
                    //Calculate allele and phase likelihoods for each allele pair
                    alleleLikelihood = CalculateAlleleLikelihood(i,j,HLAreads,false);
                    numCombinations++;
                    //A negative log likelihood means there was data at this pair.
                    if (alleleLikelihood < 0){
                        phaseLikelihood = CalculatePhaseLikelihood(i,j,false,false);
                        log1=Math.log10(frq1);
                        log2=Math.log10(frq2);
                        // Combined score: genotype + phase + log prior frequencies.
                        likelihood = alleleLikelihood+phaseLikelihood+log1+log2;
                        // Track the per-locus maximum for later normalization.
                        if (!MaxLikelihoods.containsKey(n1[0])){MaxLikelihoods.put(n1[0], likelihood);}
                        if (likelihood > (Double) MaxLikelihoods.get(n1[0])) {
                            MaxLikelihoods.put(n1[0], likelihood);
                        }
                        Likelihoods.add(likelihood);
                        String data = String.format("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f",n1[0],name1,name2,alleleLikelihood,phaseLikelihood,log1,log2,likelihood);
                        Output.add(data);
                        out.printf("INFO\t%s\n",data);
                    }
                }
            }
        }
    }

    //Print output header (one frequency column per population).
    out.printf("Locus\tA1\tA2\tGeno\tPhase\tFrq1\tFrq2\tL\tProb\tReads1\tReads2\tLocus\tEXP");
    for (int i = 0; i < Populations.length; i++){
        out.printf("\t%s",Populations[i]);
    }
    out.printf("\n");

    //Calculate probabilities for each locus.
    Double probSum = 0.0, prob = 0.0, f1 = 0.0, f2 = 0.0, aLikelihood4 = 0.0, pLikelihood4 = 0.0;
    Integer count = 0;
    Hashtable HLA4DigitProbs = new Hashtable();
    Hashtable HLA4DigitLs = new Hashtable();
    Hashtable HLA4DigitCount = new Hashtable();
    Hashtable HLA4DigitF1 = new Hashtable();
    Hashtable HLA4DigitF2 = new Hashtable();
    Hashtable HLA4DigitA = new Hashtable();
    Hashtable HLA4DigitP = new Hashtable();
    String key;
    Enumeration keys = LocusCount.keys();
    while (keys.hasMoreElements()){
        String locus = keys.nextElement().toString();
        probSum = 0.0;
        ArrayList localOutput = new ArrayList<String>();
        ArrayList localLikelihoods = new ArrayList<Double>();
        //Shift by the locus maximum and sum 10^L over pairs to normalize.
        for (int j = 0; j < Output.size(); j++){
            String data = Output.get(j).toString();
            String [] d = data.split("\\t");
            if (d[0].equals(locus)){
                localOutput.add(data);
                likelihood = (Double)Likelihoods.get(j)-(Double)MaxLikelihoods.get(locus);
                localLikelihoods.add(likelihood);
                probSum = probSum + Math.pow(10, likelihood);
            }
        }
        //Aggregate statistics for 4-digit types (pairs above 0.5% posterior).
        String [] s, s1, s2;
        int n = 0;
        for (int j = 0; j < localOutput.size(); j++){
            String data = localOutput.get(j).toString();
            prob = Math.pow(10, (Double)localLikelihoods.get(j))/probSum;
            if (prob > 0.005){
                s = data.split("\\t");
                s1 = s[1].split("\\*");
                s2 = s[2].split("\\*");
                String a1 = s1[0] + "*" + s1[1].substring(0,4);
                String a2 = s2[0] + "*" + s2[1].substring(0,4);
                key = a1 + "," + a2;
                aLikelihood4 = Double.valueOf(s[3]);
                pLikelihood4 = Double.valueOf(s[4]);
                // BUGFIX: read this row's frequency terms BEFORE combining;
                // previously f1/f2 still held the previous row's values here.
                f1 = Double.valueOf(s[5]);
                f2 = Double.valueOf(s[6]);
                likelihood = aLikelihood4 + pLikelihood4 + f1 + f2;
                if (!HLA4DigitProbs.containsKey(key)){
                    HLA4DigitProbs.put(key, prob);
                    HLA4DigitLs.put(key, likelihood);
                    HLA4DigitCount.put(key, 1);
                    HLA4DigitF1.put(key,f1);
                    HLA4DigitF2.put(key,f2);
                    HLA4DigitA.put(key,aLikelihood4);
                    HLA4DigitP.put(key,pLikelihood4);
                }else{
                    // Same 4-digit pair reached via another full-resolution
                    // pair: accumulate sums so averages can be reported below.
                    prob = prob + Double.valueOf(HLA4DigitProbs.get(key).toString());
                    HLA4DigitProbs.put(key, prob);
                    likelihood = likelihood + Double.valueOf(HLA4DigitLs.get(key).toString());
                    HLA4DigitLs.put(key, likelihood);
                    n = Integer.valueOf(HLA4DigitCount.get(key).toString()) + 1;
                    HLA4DigitCount.put(key, n);
                    aLikelihood4 = aLikelihood4 + Double.valueOf(HLA4DigitA.get(key).toString());
                    HLA4DigitA.put(key, aLikelihood4);
                    pLikelihood4 = pLikelihood4 + Double.valueOf(HLA4DigitP.get(key).toString());
                    HLA4DigitP.put(key, pLikelihood4);
                }
            }
        }
    }

    //Print results, sorted by 4-digit allele-pair key.
    String K = ""; String [] s, s1, s2;
    double count1, count2, locusCount, accountedFor;
    Vector v = new Vector(HLA4DigitProbs.keySet());
    Collections.sort(v);
    for (Enumeration e = v.elements(); e.hasMoreElements();) {
        K = (String)e.nextElement();
        prob = (Double) HLA4DigitProbs.get(K);
        likelihood = (Double) HLA4DigitLs.get(K);
        count = (Integer) HLA4DigitCount.get(K);
        s = K.split("\\,");
        s1 = s[0].split("\\*"); name1 = s1[1];
        s2 = s[1].split("\\*"); name2 = s2[1];
        aLikelihood4 = (Double) HLA4DigitA.get(K);
        pLikelihood4 = (Double) HLA4DigitP.get(K);
        f1 = (Double) HLA4DigitF1.get(K);
        f2 = (Double) HLA4DigitF2.get(K);
        count1 = Double.valueOf(AlleleCount.get(s[0]).toString());
        count2 = Double.valueOf(AlleleCount.get(s[1]).toString());
        locusCount = Double.valueOf(LocusCount.get(s1[0]).toString());
        // Fraction of this locus's reads explained by the called pair.
        if (s[0].equals(s[1])){
            accountedFor = count1 / locusCount;
        }else{
            accountedFor = (count1 + count2) / locusCount;
        }
        // Confident calls (posterior > 0.1) are printed as plain rows...
        if (prob > 0.1){
            out.printf("%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
            // NOTE(review): this loop clobbers f1/f2, so the INFO row below
            // then shows the last population's frequencies, not the stored
            // ones -- preserved as-is; confirm whether intentional.
            for (int i = 0; i < Populations.length; i++){
                if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
                if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
                if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
            }
            out.print("\n");
        }
        // ...and every aggregated pair is printed as an INFO row.
        out.printf("INFO\t%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
        for (int i = 0; i < Populations.length; i++){
            if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
            if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
            if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
        }
        out.print("\n");
    }
}
// Comparator over Double values; equivalent to the natural ordering.
// NOTE(review): not referenced anywhere in this file's visible code --
// possibly left over from an earlier sorting approach. Confirm before removing.
Comparator<Double> valueComparator = new Comparator<Double>() {
    @Override public int compare(Double val1, Double val2) {
        return val1.compareTo(val2);
    }
};
/**
 * Enumerates every genomic position in the HLA-A, HLA-C and HLA-B gene
 * regions (b36 coordinates), concatenated in that order: A, then C, then B.
 *
 * @return one Integer per position, inclusive of both range endpoints
 */
private Integer[] InitializePolymorphicSites(){
    int[][] geneRanges = {
        {30018310, 30021211},   // HLA-A
        {31344925, 31347827},   // HLA-C
        {31430239, 31432914},   // HLA-B
    };
    int total = 0;
    for (int[] range : geneRanges){
        total += range[1] - range[0] + 1;
    }
    Integer[] polymorphicSites = new Integer[total];
    int k = 0;
    for (int[] range : geneRanges){
        for (int pos = range[0]; pos <= range[1]; pos++){
            polymorphicSites[k++] = pos;
        }
    }
    return polymorphicSites;
}
/**
 * Maps a base to its array column: A=0, C=1, G=2, T=3; -1 for anything else.
 * Deletions ('D') are deliberately not indexed (see the reserved 5th slot
 * in the observation arrays).
 */
private int IndexOf(char c){
    return "ACGT".indexOf(c);
}
/**
 * Returns true when pos falls inside any loaded phase interval (inclusive
 * on both ends). With no intervals loaded (numIntervals == 0) nothing matches.
 */
private boolean IsWithinInterval(int pos){
    for (int i = 0; i < numIntervals; i++){
        int lo = intervals[i][0];
        int hi = intervals[i][1];
        if (lo <= pos && pos <= hi){
            return true;
        }
    }
    return false;
}
private void UpdateCorrelation(SAMRecord read){
    //Updates correlation table with SNPs from specific read (for phasing).
    // For every pair of polymorphic sites covered by this read, increments
    // numObservations[site1*5+base1][site2*5+base2] and totalObservations[site1][site2].
    String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());
    // NOTE(review): SNPsInRead and readindex are allocated but never used below.
    ArrayList<Integer> SNPsInRead = new ArrayList<Integer>();
    ArrayList<Integer> readindex = new ArrayList<Integer>();
    int readstart = read.getAlignmentStart();
    int readend = read.getAlignmentEnd();
    int numPositions = PolymorphicSites.length;
    char c1, c2;
    int a, b, i, j, SNPcount = 0;
    //Find all SNPs in read. Bounds are exclusive, so polymorphic sites
    //exactly at the read's first/last aligned base are skipped -- confirm intent.
    for (i = 0; i < numPositions; i++){
        if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
            SNPnumInRead[i] = SNPcount;
            SNPposInRead[i] = PolymorphicSites[i]-readstart;
            SNPcount++;
        }else{
            // -1 marks sites this read does not cover.
            SNPnumInRead[i] = -1;
            SNPposInRead[i] = -1;
        }
    }
    //Update correlation table; for each combination of SNP positions
    for (i = 0; i < numPositions; i++){
        if (SNPnumInRead[i] > -1){
            c1 = s.charAt(SNPposInRead[i]);
            if (IndexOf(c1) > -1){
                for (j = i+1; j < numPositions; j ++){
                    if (SNPnumInRead[j] > -1){
                        c2 = s.charAt(SNPposInRead[j]);
                        if (IndexOf(c2) > -1){
                            // Row/column encode site index and observed base together
                            // (5 slots per site; only 4 base indices are used).
                            a = i*5 + IndexOf(c1);
                            b = j*5 + IndexOf(c2);
                            numObservations[a][b]++;
                            totalObservations[i][j]++;
                            if (DEBUG){
                                //out.printf("INFO %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b);
                            }
                        }
                    }
                }
            }
        }
    }
}
/**
 * Maps an unordered base pair to its diploid genotype index:
 * AA=0 AC=1 AG=2 AT=3 CC=4 CG=5 CT=6 GG=7 GT=8 TT=9; -1 if either base
 * is not one of A/C/G/T. Symmetric: GenotypeIndex(a,b) == GenotypeIndex(b,a).
 *
 * Rewritten from nested switches that had no break statements: an invalid
 * second base silently fell through several outer case bodies before
 * reaching the default. The result happened to be correct (-1), but the
 * control flow was accidental and fragile; this version is explicit and
 * produces the identical mapping.
 */
private int GenotypeIndex(char a, char b){
    int i = "ACGT".indexOf(a);
    int j = "ACGT".indexOf(b);
    if (i < 0 || j < 0) {
        return -1;
    }
    if (i > j) {  // order-independent: normalize to the upper triangle
        int tmp = i;
        i = j;
        j = tmp;
    }
    // Row offset i*(7-i)/2 enumerates the upper-triangular 4x4 genotype matrix:
    // row A starts at 0, C at 4, G at 7, T at 9.
    return i * (7 - i) / 2 + j;
}
private double CalculateAlleleLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
    //Calculates the (log-scale) genotype likelihood for a specific allele pair:
    // at every position with base likelihoods that lies strictly inside BOTH
    // allele sequences, adds the likelihood of the diploid genotype implied
    // by the two alleles' bases at that position.
    String read1 = HLAalleles[a1];
    String read2 = HLAalleles[a2];
    int start1 = HLAstartpos[a1];
    int start2 = HLAstartpos[a2];
    int stop1 = HLAstoppos[a1];
    int stop2 = HLAstoppos[a2];
    double likelihood = 0;
    int pos, index;
    char c1, c2;
    for (int i = 0; i < positions.length; i++){
        pos = positions[i];
        // Strict inequalities: positions exactly at an allele's start/stop
        // coordinate are excluded -- confirm intent.
        if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){
            index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2));
            // index == -1 means a non-ACGT character; such positions are skipped.
            if (index > -1){
                likelihood = likelihood + baseLikelihoods[i][index];
                if (debug){
                    c1 = read1.charAt(pos-start1);
                    c2 = read2.charAt(pos-start2);
                    out.printf("INFO: DEBUG %s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood);
                }
            }
        }
    }
    return likelihood;
}
private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG, boolean SINGLEALLELE){
    //calculate the likelihood that the particular combination of alleles satisfies the phase count data.
    // For each pair of polymorphic sites covered by both alleles, compares the
    // alleles' base pairs against the read co-observation counts gathered in
    // UpdateCorrelation. When SINGLEALLELE is true the return value is instead
    // the running in-phase fraction (sumInPhase / sumObservations).
    double likelihood = 0, prob = 0;
    int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
    int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
    // Restrict to the region covered by both alleles.
    int combinedstart = Math.max(readstart1,readstart2);
    int combinedstop = Math.min(readend1,readend2);
    int numPositions = PolymorphicSites.length, SNPcount = 0;
    int i, j, a1, a2, b1, b2;
    char c11, c12, c21, c22;
    int numInPhase = 0, numOutOfPhase = 0;
    double sumInPhase = 0.0, sumObservations = 0.0;
    //Find all SNPs in read (strict bounds exclude the overlap endpoints)
    for (i = 0; i < numPositions; i++){
        if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop ){ // && IsWithinInterval(PolymorphicSites[i])
            if (PRINTDEBUG){
                out.printf("DEBUG\t%s\t%s\n",PolymorphicSites[i],SNPcount);
            }
            SNPnumInRead[i] = SNPcount;
            SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
            SNPcount++;
        }else{
            SNPnumInRead[i] = -1;
            SNPposInRead[i] = -1;
        }
    }
    String s1 = HLAreads[alleleIndex1];
    String s2 = HLAreads[alleleIndex2];
    if (PRINTDEBUG){
        out.printf("DEBUG %s SNPs found in %s and %s between %s and %s\n",SNPcount,HLAnames[alleleIndex1], HLAnames[alleleIndex2],combinedstart,combinedstop);
    }
    //Iterate through every pairwise combination of SNPs, and update likelihood for the allele combination
    for (i = 0; i < numPositions; i++){
        if (SNPnumInRead[i] > -1){
            c11 = s1.charAt(SNPposInRead[i]);
            c21 = s2.charAt(SNPposInRead[i]);
            if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
                for (j = i+1; j < numPositions; j ++){
                    if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
                        c12 = s1.charAt(SNPposInRead[j]);
                        c22 = s2.charAt(SNPposInRead[j]);
                        if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
                            // Encoded (site, base) indices into numObservations.
                            a1 = i*5 + IndexOf(c11);
                            b1 = j*5 + IndexOf(c12);
                            a2 = i*5 + IndexOf(c21);
                            b2 = j*5 + IndexOf(c22);
                            //check if the two alleles are identical at the chosen 2 locations
                            //(avoid double-counting the same observation cell when they are)
                            if ((c11 == c21) && (c12 == c22)){
                                numInPhase = numObservations[a1][b1];
                            }else{
                                numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
                            }
                            numOutOfPhase = totalObservations[i][j] - numInPhase;
                            sumInPhase += (double) numInPhase;
                            sumObservations += (double) totalObservations[i][j];
                            if (SINGLEALLELE){
                                // Overwritten each iteration: final value is the
                                // overall in-phase fraction, not a log likelihood.
                                likelihood = sumInPhase / sumObservations;
                            }else{
                                // Binomial-style log10 score with 1% error model.
                                likelihood += numInPhase * L_correct + numOutOfPhase * L_err;
                            }
                            //prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001);
                            //likelihood += Math.log10(prob);
                            //likelihood = Math.max(Math.log10(sumInPhase / sumObservations),-10);
                            if (PRINTDEBUG){
                                out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%s / %s\t %.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],sumInPhase,sumObservations,likelihood);
                            }
                            // NOTE(review): stops after the FIRST observed partner
                            // site j for each i -- presumably the "more efficient
                            // allele comparison" optimization; confirm it is not
                            // meant to scan all j pairs.
                            break;
                        }
                    }
                }
            }
        }
    }
    return likelihood;
}
private void ExtraCode(){
    // NOTE(review): named "ExtraCode" and not called from any method visible
    // in this file -- appears to be a retained earlier strategy for choosing
    // AllelesToSearch (now supplied by SimilarityFileReader). Confirm before removing.
    String name1, name2;
    //Pre-process homozygous combinations to determine top possible alleles (for efficiency)
    Hashtable Alleles2Digit = new Hashtable();
    Hashtable Phase2Digit = new Hashtable();
    Hashtable Count2Digit = new Hashtable();
    Hashtable AllelesAtLocus = new Hashtable();
    ArrayList Loci = new ArrayList<String>();
    double[] AlleleLikelihoods2 = new double[HLAnames.length];
    double[] PhaseLikelihoods2 = new double[HLAnames.length];
    for (int i = 0; i < HLAnames.length; i++){
        name1 = HLAnames[i].substring(4);
        String [] n1 = name1.split("\\*");
        // Homozygous (i,i) scores; SINGLEALLELE=true for the phase term.
        AlleleLikelihoods2[i] = CalculateAlleleLikelihood(i,i,HLAreads,false);
        PhaseLikelihoods2[i] = CalculatePhaseLikelihood(i,i,false,true);
        if (AlleleLikelihoods2[i] < 0){
            // NOTE(review): keyed by locus*4digits despite the "2Digit" names.
            name2 = n1[0] + "*" + n1[1].substring(0, 4);
            if (!Loci.contains(n1[0])){
                Loci.add(n1[0]);
                MaxLikelihoods.put(n1[0], 0.0);
                AllelesAtLocus.put(n1[0], 1);
            }else{
                AllelesAtLocus.put(n1[0], 1+(Integer)AllelesAtLocus.get(n1[0]));
            }
            // Keep the best homozygous scores and a count per truncated name.
            if (!Alleles2Digit.containsKey(name2)){
                Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
                Phase2Digit.put(name2, PhaseLikelihoods2[i]);
                Count2Digit.put(name2, 1.0);
            }else {
                if (AlleleLikelihoods2[i] > (Double) Alleles2Digit.get(name2)){
                    Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
                }
                if (PhaseLikelihoods2[i] > (Double) Phase2Digit.get(name2)){
                    Phase2Digit.put(name2, PhaseLikelihoods2[i]);
                }
                Count2Digit.put(name2,1.0+(Double)Count2Digit.get(name2));
            }
        }
    }
    //Sort alleles at 2 digit resolution for each locus
    for (int i = 0; i < Loci.size(); i++){
        Enumeration k = Alleles2Digit.keys();
        Hashtable AllelesAtLoci = new Hashtable();
        HashMap map = new HashMap();
        int numalleles = 0;
        //find alleles at the locus
        while( k.hasMoreElements() ){
            name1 = k.nextElement().toString();
            String [] n1 = name1.split("\\*");
            if (Loci.get(i).equals(n1[0])){
                numalleles++;
                // Negated so an ascending sort puts the best allele first.
                map.put(name1,-1 * (Double) Alleles2Digit.get(name1));
                AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
                //out.printf("%s\t%.2f\n",name1,-1 * (Double) Alleles2Digit.get(name1));
            }
        }
        //Sort alleles at locus, mark top six 2-digit classes for deep search
        List<Map.Entry<String, Double>> entries = new ArrayList<Entry<String, Double>>(map.entrySet());
        Collections.sort(entries, new Comparator<Entry<String, Double>>() {
            public int compare(Entry<String, Double> e1, Entry<String, Double> e2) {
                return e1.getValue().compareTo(e2.getValue());
            }
        });
        int num = 1;
        for (Map.Entry<String, Double> entry : entries) {
            // Search the top max(5, classes/8) entries at this locus.
            if (num <= Math.max(5,entries.size()/8)){
                AllelesToSearch.add(entry.getKey());
                out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                num++;
            }else{
                if (!AllelesToSearch.contains(entry.getKey())){
                    out.printf("INFO\t%s\t%.2f\t%.2f\tNotSearched\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                }else{
                    out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                }
            }
        }
        out.printf("INFO\n");
    }
}
}

View File

@ -0,0 +1,79 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
/**
*
* @author shermanjia
*/
/**
 * Reads the tab-delimited HLA dictionary file (columns: name, start, stop,
 * sequence) and exposes the parsed alleles plus their overall coordinate span.
 *
 * BUGFIX: minstartpos previously defaulted to 0, so Math.min() during parsing
 * could never decrease it and GetMinStartPos() always returned 0 for positive
 * genomic coordinates. It now starts at Integer.MAX_VALUE (still returned
 * unchanged if no file has been read -- callers should read a file first).
 */
public class HLAFileReader {
    ArrayList<String> Sequences = new ArrayList<String>();
    ArrayList<String> Names = new ArrayList<String>();
    ArrayList<Integer> StartPositions = new ArrayList<Integer>();
    ArrayList<Integer> StopPositions = new ArrayList<Integer>();
    int minstartpos = Integer.MAX_VALUE;  // smallest start seen across all alleles
    int maxstoppos = 0;                   // largest stop seen across all alleles
    CigarParser formatter = new CigarParser();

    /** @return allele names, in file order. */
    public String[] GetNames(){
        return Names.toArray(new String[Names.size()]);
    }

    /** @return allele sequences, parallel to GetNames(). */
    public String[] GetSequences(){
        return Sequences.toArray(new String[Sequences.size()]);
    }

    /** @return allele start coordinates, parallel to GetNames(). */
    public Integer[] GetStartPositions(){
        return StartPositions.toArray(new Integer[StartPositions.size()]);
    }

    /** @return allele stop coordinates, parallel to GetNames(). */
    public Integer[] GetStopPositions(){
        return StopPositions.toArray(new Integer[StopPositions.size()]);
    }

    /** @return the smallest start position across all loaded alleles. */
    public Integer GetMinStartPos(){
        return minstartpos;
    }

    /** @return the largest stop position across all loaded alleles. */
    public Integer GetMaxStopPos(){
        return maxstoppos;
    }

    /** @return the index of the named allele, or -1 if not loaded. */
    public int GetIndex(String readname){
        // indexOf already returns -1 when absent; one scan instead of two.
        return Names.indexOf(readname);
    }

    /**
     * Parses the dictionary file line by line. Malformed input or I/O errors
     * are reported to stderr (matching the original best-effort behavior);
     * the reader is always closed.
     */
    public void ReadFile(String filename){
        BufferedReader br = null;
        try{
            br = new BufferedReader(new FileReader(filename));
            String strLine;
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                Sequences.add(s[3]);
                Names.add(s[0]);
                int start = Integer.valueOf(s[1]);
                int stop = Integer.valueOf(s[2]);
                StartPositions.add(start);
                StopPositions.add(stop);
                minstartpos = Math.min(minstartpos, start);
                maxstoppos = Math.max(maxstoppos, stop);
            }
        }catch (Exception e){
            System.err.println("HLAFileReader Error: " + e.getMessage());
        }finally{
            if (br != null){
                try { br.close(); } catch (IOException ignored) { /* best-effort close */ }
            }
        }
    }
}

View File

@ -24,6 +24,14 @@ public class PolymorphicSitesFileReader {
return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]);
}
public void AddSites(Integer [] sites){
for (int i = 0; i < sites.length; i++){
if (!PolymorphicSites.contains(sites[i])){
PolymorphicSites.add(sites[i]);
}
}
}
public void ReadFile(String filename){
try{
FileInputStream fstream = new FileInputStream(filename);
@ -34,10 +42,10 @@ public class PolymorphicSitesFileReader {
int i = 0;
while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t");
if (s[0].equals("POLYMORPHIC")){
PolymorphicSites.add(Integer.valueOf(s[2]));
if (Double.valueOf(s[8]) > 0.1){
PolymorphicSites.add(Integer.valueOf(s[0]));
}else{
NonPolymorphicSites.add(Integer.valueOf(s[2]));
NonPolymorphicSites.add(Integer.valueOf(s[0]));
}
}
in.close();

View File

@ -14,6 +14,9 @@ import java.util.Hashtable;
*/
public class SimilarityFileReader {
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
ArrayList<String> AllelesToSearch = new ArrayList<String>();
Hashtable AlleleCount = new Hashtable();
Hashtable LocusCount = new Hashtable();
Hashtable Concordance = new Hashtable();
Hashtable NumMatches = new Hashtable();
Hashtable NumMismatches = new Hashtable();
@ -22,10 +25,22 @@ public class SimilarityFileReader {
return ReadsToDiscard;
}
public ArrayList<String> GetAllelesToSearch(){
return AllelesToSearch;
}
public String[] GetReadsToDiscardArray(){
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
}
public Hashtable GetAlleleCount(){
return AlleleCount;
}
public Hashtable GetLocusCount(){
return LocusCount;
}
public Hashtable GetConcordance(){
return Concordance;
}
@ -43,26 +58,56 @@ public class SimilarityFileReader {
FileInputStream fstream = new FileInputStream(filename);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
String strLine; String [] s = null, alleles = null, a = null; String allele;
//Read File Line By Line
int i = 0;
while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t");
if (s.length >= 6){
Double matchFraction = Double.valueOf(s[4]);
int numMismatches = Integer.valueOf(s[6]);
Double matchFraction = Double.valueOf(s[3]);
int numMismatches = Integer.valueOf(s[5]);
int numMatches = Integer.valueOf(s[4]);
Concordance.put(s[0],matchFraction);
NumMatches.put(s[0], s[5]);
NumMatches.put(s[0], s[4]);
NumMismatches.put(s[0], numMismatches);
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){
ReadsToDiscard.add(s[0]);
}else{
Hashtable fourDigitAlleles = new Hashtable();
alleles = s[6].split("\\,");
if (alleles.length > 0){
a = alleles[0].split("\\_");
s = a[1].split("\\*");
if (!LocusCount.containsKey(s[0])){
LocusCount.put(s[0], 1);
}else{
LocusCount.put(s[0], (Integer) LocusCount.get(s[0]) + 1);
}
}
for (int j = 0; j < alleles.length; j++){
a = alleles[j].split("\\_");
s = a[1].split("\\*");
allele = s[0] + "*" + s[1].substring(0,4);
if (!fourDigitAlleles.containsKey(allele)){
fourDigitAlleles.put(allele, allele);
if (!AlleleCount.containsKey(allele)){
AlleleCount.put(allele, 1);
}else{
AlleleCount.put(allele, (Integer) AlleleCount.get(allele) + 1);
}
if ((Integer) AlleleCount.get(allele) > 1 && !AllelesToSearch.contains(allele)){
AllelesToSearch.add(allele);
}
}
}
}
}
}
in.close();
}catch (Exception e){//Catch exception if any
System.err.println("SimilarityFile Error: " + e.getMessage());
//System.err.println("SimilarityFile Error: " + e.getMessage());
}
}
}