HLA caller updated - now searches all (common and rare) alleles, more efficient read filtering and allele comparison runs.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3543 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d51e6c45a7
commit
5704294f9d
|
|
@ -33,7 +33,9 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
|
||||
import java.util.Enumeration;
|
||||
import java.util.Vector;
|
||||
import java.util.Collections;
|
||||
|
||||
/**
|
||||
* Calculates likelihood of observing the data given pairs of HLA alleles. NOTE: run CalculateBaseLikelihoods first! Usage: java -jar GenomeAnalysisTK.jar -T CalculateAlleleLikelihoods -I /humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods -eth\
|
||||
|
|
@ -60,17 +62,20 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
|
||||
Hashtable AlleleFrequencies,UniqueAlleles;
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles4Digit";
|
||||
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.txt";
|
||||
String HLA2DigitFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY_2DIGIT.txt";
|
||||
|
||||
Hashtable AlleleFrequencies,UniqueAlleles,Alleles2Digit;
|
||||
|
||||
CigarParser formatter = new CigarParser();
|
||||
double[][] baseLikelihoods;
|
||||
int[] positions;
|
||||
boolean loaded = false;
|
||||
|
||||
String[] HLAnames, HLAreads;
|
||||
Integer[] HLAstartpos, HLAstoppos;
|
||||
ArrayList<String> HLAnamesAL, HLAreadsAL;
|
||||
String[] HLAnames, HLAreads, HLAnames2, HLAreads2;
|
||||
Integer[] HLAstartpos, HLAstoppos, HLAstartpos2, HLAstoppos2;
|
||||
ArrayList<String> HLAnamesAL, HLAreadsAL, Loci, AllelesToSearch;
|
||||
ArrayList<Integer> HLAstartposAL, HLAstopposAL;
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
@ -86,21 +91,27 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
HLAstartposAL = new ArrayList<Integer>();
|
||||
HLAstopposAL = new ArrayList<Integer>();
|
||||
|
||||
if (!ethnicity.equals("CaucasianUSA")){
|
||||
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
out.printf("INFO Reading HLA alleles ... ");
|
||||
HLAFileReader HLADictionaryReader = new HLAFileReader();
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLAreads = HLADictionaryReader.GetSequences();
|
||||
HLAnames = HLADictionaryReader.GetNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
|
||||
HLADictionaryReader = new HLAFileReader();
|
||||
HLADictionaryReader.ReadFile(HLA2DigitFile);
|
||||
HLAreads2 = HLADictionaryReader.GetSequences();
|
||||
HLAnames2 = HLADictionaryReader.GetNames();
|
||||
HLAstartpos2 = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos2 = HLADictionaryReader.GetStopPositions();
|
||||
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
||||
|
||||
//out.printf("INFO Common alleles:\n");
|
||||
for (int i = 1; i < UniqueAlleles.size(); i++){
|
||||
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
|
||||
}
|
||||
out.printf("INFO Reading HLA dictionary ...");
|
||||
//out.printf("INFO Reading HLA dictionary ...");
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -108,10 +119,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
}
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
HLAnamesAL.add(read.getReadName());
|
||||
HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
|
||||
HLAstartposAL.add(read.getAlignmentStart());
|
||||
HLAstopposAL.add(read.getAlignmentEnd());
|
||||
//HLAnamesAL.add(read.getReadName());
|
||||
//HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
|
||||
//HLAstartposAL.add(read.getAlignmentStart());
|
||||
//HLAstopposAL.add(read.getAlignmentEnd());
|
||||
//out.printf("INFO\t%s\t%s\t%s\t%s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),formatter.FormatRead(read.getCigarString(), read.getReadString()));
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -154,11 +166,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
}
|
||||
|
||||
public void onTraversalDone(Integer numreads) {
|
||||
out.printf("Done! %s alleles found\n", numreads);
|
||||
HLAnames = HLAnamesAL.toArray(new String[numreads]);
|
||||
HLAreads = HLAreadsAL.toArray(new String[numreads]);
|
||||
HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
|
||||
HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
|
||||
//out.printf("Done! %s alleles found\n", numreads);
|
||||
//HLAnames = HLAnamesAL.toArray(new String[numreads]);
|
||||
//HLAreads = HLAreadsAL.toArray(new String[numreads]);
|
||||
//HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
|
||||
//HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
|
||||
|
||||
double[][] AlleleLikelihoods = new double[numreads][numreads];
|
||||
|
||||
|
|
@ -173,6 +185,7 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
int numcombinations = 0;
|
||||
out.printf("NUM\tAllele1\tAllele2\tSSG\n");
|
||||
|
||||
//debugging specific alleles
|
||||
int index1 = -1, index2 = -1;
|
||||
if (!debugAlleles.equals("")){
|
||||
String s[] = debugAlleles.split(",");
|
||||
|
|
@ -185,52 +198,96 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
}
|
||||
if (index1 > -1 && index2 > -1){
|
||||
out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
|
||||
double dl = CalculateLikelihood(index1,index2,true);
|
||||
double dl = CalculateLikelihood(index1,index2,HLAreads2,true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//Pre-process homozygous combinations to determine top possible alleles (for efficiency)
|
||||
int numreads2 = HLAnames2.length;
|
||||
Alleles2Digit = new Hashtable();
|
||||
Loci = new ArrayList<String>();
|
||||
double[] AlleleLikelihoods2 = new double[numreads];
|
||||
for (int i = 0; i < numreads; i++){
|
||||
name1 = HLAnames[i].substring(4);
|
||||
String [] n1 = name1.split("\\*");
|
||||
// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3));
|
||||
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
|
||||
//out.printf("1: %s\n",name1);
|
||||
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||
//if (frq1 > minfrq){
|
||||
numcombinations++;
|
||||
AlleleLikelihoods2[i] = CalculateLikelihood(i,i,HLAreads,false);
|
||||
if (AlleleLikelihoods2[i] < 0){
|
||||
name2 = n1[0] + "*" + n1[1].substring(0, 2);
|
||||
if (!Loci.contains(n1[0])){Loci.add(n1[0]);}
|
||||
if (!Alleles2Digit.containsKey(name2)){
|
||||
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
|
||||
}else if ((Double) Alleles2Digit.get(name2) < AlleleLikelihoods2[i]){
|
||||
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Sort alleles at 2 digit resolution for each locus
|
||||
AllelesToSearch = new ArrayList<String>();
|
||||
for (int i = 0; i < Loci.size(); i++){
|
||||
Enumeration k = Alleles2Digit.keys();
|
||||
Hashtable AllelesAtLoci = new Hashtable();
|
||||
|
||||
//find alleles at the locus
|
||||
while( k.hasMoreElements() ){
|
||||
name1 = k.nextElement().toString();
|
||||
String [] n1 = name1.split("\\*");
|
||||
if (Loci.get(i).equals(n1[0])){
|
||||
AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
|
||||
}
|
||||
}
|
||||
|
||||
//Sort alleles at locus, mark top six 2-digit classes for deep search
|
||||
int num = 1;
|
||||
Vector v = new Vector(AllelesAtLoci.keySet());
|
||||
Collections.sort(v);
|
||||
for (Enumeration e = v.elements(); e.hasMoreElements();) {
|
||||
Double key = Double.valueOf(e.nextElement().toString());
|
||||
String allele = AllelesAtLoci.get(key).toString();
|
||||
if (num <= 10){
|
||||
AllelesToSearch.add(allele);
|
||||
|
||||
num++;
|
||||
}
|
||||
//out.printf("%s\t%s\n",allele,key);
|
||||
}
|
||||
}
|
||||
|
||||
//Iterate through allele pairs to calculate likelihoods
|
||||
if (true){
|
||||
numcombinations = 0;
|
||||
for (int i = 0; i < numreads; i++){
|
||||
name1 = HLAnames[i].substring(4);
|
||||
String [] n1 = name1.split("\\*");
|
||||
if (AllelesToSearch.contains(n1[0] + "*" + n1[1].substring(0, 2))){
|
||||
//out.printf("1: %s\n",name1);
|
||||
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||
//if (frq1 > minfrq){
|
||||
for (int j = i; j < numreads; j++){
|
||||
name2 = HLAnames[j].substring(4);
|
||||
String [] n2 = name2.split("\\*");
|
||||
// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3));
|
||||
if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
|
||||
|
||||
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
||||
// if (frq2 > minfrq){
|
||||
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
|
||||
numcombinations++;
|
||||
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,false);
|
||||
if (AllelesToSearch.contains(n2[0] + "*" + n2[1].substring(0, 2))){
|
||||
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
|
||||
numcombinations++;
|
||||
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,HLAreads,false);
|
||||
if (AlleleLikelihoods[i][j] < 0){
|
||||
out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]);
|
||||
}
|
||||
// }else{
|
||||
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name2,frq2);}
|
||||
// }
|
||||
// }else{
|
||||
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name2);}
|
||||
}
|
||||
}
|
||||
}
|
||||
//}else{
|
||||
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
|
||||
//}
|
||||
//}else{
|
||||
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private double CalculateLikelihood(int a1, int a2, boolean debug){
|
||||
String read1 = HLAreads[a1];
|
||||
String read2 = HLAreads[a2];
|
||||
private double CalculateLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
|
||||
//Calculates likelihood for specific allele pair
|
||||
String read1 = HLAalleles[a1];
|
||||
String read2 = HLAalleles[a2];
|
||||
int start1 = HLAstartpos[a1];
|
||||
int start2 = HLAstartpos[a2];
|
||||
int stop1 = HLAstoppos[a1];
|
||||
|
|
|
|||
|
|
@ -72,34 +72,13 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
|
||||
ArrayList<String> AllReadNames = new ArrayList<String>();
|
||||
|
||||
boolean HLAdataLoaded = false;
|
||||
boolean dataLoaded = false;
|
||||
|
||||
|
||||
//Loads HLA dictionary, allele frequencies, and reads to filter
|
||||
//Loads reads to filter
|
||||
public Pair<Long, Long> reduceInit() {
|
||||
if (!HLAdataLoaded){
|
||||
HLAdataLoaded = true;
|
||||
|
||||
out.printf("INFO Reading HLA database ... ");
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLAreads = HLADictionaryReader.GetReads();
|
||||
HLAnames = HLADictionaryReader.GetReadNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
InitializeVariables(HLAreads.length);
|
||||
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
||||
|
||||
|
||||
|
||||
if (!ethnicity.equals("CaucasianUSA")){
|
||||
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
if (!dataLoaded){
|
||||
dataLoaded = true;
|
||||
|
||||
if (!filterFile.equals("")){
|
||||
out.printf("INFO Reading properties file ... ");
|
||||
|
|
@ -137,7 +116,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
|
||||
int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
|
||||
//if (DEBUG){
|
||||
out.printf("%s\t%s\t", context.getLocation(),ref.getBase());
|
||||
out.printf("%s\t%s\t", context.getLocation(),(char)ref.getBase());
|
||||
//}
|
||||
|
||||
//Calculate posterior probabilities
|
||||
|
|
|
|||
|
|
@ -53,24 +53,25 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
|
||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "Caucasian";
|
||||
|
||||
@Argument(fullName = "useInterval", shortName = "useInterval", doc = "Use only these intervals", required = false)
|
||||
public String intervalFile = "";
|
||||
|
||||
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
|
||||
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
|
||||
|
||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
|
||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
|
||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
|
||||
|
||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
|
||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||
HLAFileReader HLADictionaryReader = new HLAFileReader();
|
||||
|
||||
boolean DatabaseLoaded = false;
|
||||
boolean DEBUG = false;
|
||||
ArrayList<String> ClosestAlleles = new ArrayList<String>();
|
||||
|
||||
String[] HLAnames, HLAreads;
|
||||
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
|
||||
|
|
@ -88,6 +89,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
Hashtable AlleleFrequencies = new Hashtable();
|
||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||
CigarParser formatter = new CigarParser();
|
||||
int [][] intervals; int numIntervals;
|
||||
|
||||
public Integer reduceInit() {
|
||||
if (!DatabaseLoaded){
|
||||
|
|
@ -96,9 +98,9 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
//Load HLA dictionary
|
||||
out.printf("INFO Loading HLA dictionary ... ");
|
||||
|
||||
HLADictionaryReader.ReadFile(HLAdictionaryFile);
|
||||
HLAreads = HLADictionaryReader.GetReads();
|
||||
HLAnames = HLADictionaryReader.GetReadNames();
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLAreads = HLADictionaryReader.GetSequences();
|
||||
HLAnames = HLADictionaryReader.GetNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
minstartpos = HLADictionaryReader.GetMinStartPos();
|
||||
|
|
@ -110,26 +112,27 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
concordance = new double[HLAreads.length];
|
||||
numcompared = new double[HLAreads.length];
|
||||
|
||||
//Read allele frequencies
|
||||
if (ethnicity.equals("Black")){
|
||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||
}else{
|
||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
//FindPolymorphicSites(minstartpos,maxstoppos);
|
||||
|
||||
//Load list of polymorphic sites
|
||||
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
|
||||
siteFileReader.ReadFile(PolymorphicSitesFile);
|
||||
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
||||
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
|
||||
numpolymorphicsites = PolymorphicSites.length;
|
||||
numnonpolymorphicsites = NonPolymorphicSites.length;
|
||||
|
||||
if (!intervalFile.equals("")){
|
||||
TextFileReader fileReader = new TextFileReader();
|
||||
fileReader.ReadFile(intervalFile);
|
||||
String[] lines = fileReader.GetLines();
|
||||
intervals = new int[lines.length][2];
|
||||
for (int i = 0; i < lines.length; i++) {
|
||||
String[] s = lines[i].split(":");
|
||||
String[] intervalPieces = s[1].split("-");
|
||||
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
|
||||
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
|
||||
}
|
||||
numIntervals = intervals.length;
|
||||
}
|
||||
|
||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||
out.printf("INFO Comparing reads to database ...\n");
|
||||
|
|
@ -167,7 +170,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
|
||||
for (int j = 0; j < numpolymorphicsites; j++){
|
||||
pos = PolymorphicSites[j];
|
||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
||||
|
|
@ -176,7 +179,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
nummatched[i]++;
|
||||
}else{
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, pos,c1,c2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -187,13 +190,13 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
if (numcompared[i] > 0){
|
||||
for (int j = 0; j < numnonpolymorphicsites; j++){
|
||||
pos = NonPolymorphicSites[j];
|
||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
||||
numcompared[i]++;
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -205,7 +208,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
concordance[i]=nummatched[i]/numcompared[i];
|
||||
if (concordance[i] > maxConcordance){maxConcordance = concordance[i];}
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
}
|
||||
if (findFirst && (concordance[i] == 1)){
|
||||
break;
|
||||
|
|
@ -228,32 +231,42 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
return maxFreq;
|
||||
}
|
||||
|
||||
private boolean IsWithinInterval(int pos){
|
||||
boolean isWithinInterval = false;
|
||||
for (int i = 0; i < numIntervals; i++){
|
||||
if (pos >= intervals[i][0] && pos <= intervals[i][1]){
|
||||
isWithinInterval = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return isWithinInterval;
|
||||
}
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
//Calculate concordance for this read and all overlapping reads
|
||||
if (read.getMappingQuality() > 0){
|
||||
double maxConcordance = CalculateConcordance(read);
|
||||
|
||||
String stats = "", topAlleles = "";
|
||||
if (maxConcordance > 0){
|
||||
String readname = read.getReadName(), allelename = ""; double freq;
|
||||
//For input bam files that contain HLA alleles, find and print allele frequency
|
||||
//freq = GetAlleleFrequency(readname);
|
||||
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
|
||||
|
||||
//Find the maximum frequency of the alleles most concordant with the read
|
||||
//double maxFreq = FindMaxAlleleFrequency(maxConcordance);
|
||||
|
||||
//Print concordance statistics between this read and the most similar HLA allele(s)
|
||||
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (concordance[i] == maxConcordance){
|
||||
freq = GetAlleleFrequency(HLAnames[i]);
|
||||
//if (freq == maxFreq){
|
||||
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
//}
|
||||
break;
|
||||
//freq = GetAlleleFrequency(HLAnames[i]);
|
||||
if (topAlleles.equals("")){
|
||||
topAlleles = HLAnames[i];
|
||||
}else{
|
||||
topAlleles = topAlleles + "," + HLAnames[i];
|
||||
}
|
||||
stats = String.format("%.1f\t%.3f\t%.0f\t%.0f",1.0,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
|
||||
}
|
||||
}
|
||||
out.print("\n");
|
||||
out.printf("\t%s\t%s\n",stats,topAlleles);
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
|
|
|
|||
|
|
@ -29,18 +29,12 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||
|
||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
|
||||
//String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
|
||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
|
||||
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
|
||||
HLAFileReader HLADictionaryReader = new HLAFileReader();
|
||||
|
||||
boolean DatabaseLoaded = false;
|
||||
boolean DEBUG = false;
|
||||
|
|
@ -69,8 +63,8 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
|||
out.printf("INFO Loading HLA dictionary ... ");
|
||||
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLAreads = HLADictionaryReader.GetReads();
|
||||
HLAnames = HLADictionaryReader.GetReadNames();
|
||||
HLAreads = HLADictionaryReader.GetSequences();
|
||||
HLAnames = HLADictionaryReader.GetNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
minstartpos = HLADictionaryReader.GetMinStartPos();
|
||||
|
|
@ -82,18 +76,6 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
|||
concordance = new double[HLAreads.length];
|
||||
numcompared = new double[HLAreads.length];
|
||||
|
||||
//Read allele frequencies
|
||||
if (ethnicity.equals("Black")){
|
||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||
}else{
|
||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
FindPolymorphicSites(minstartpos,maxstoppos);
|
||||
|
||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||
|
|
@ -108,13 +90,15 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
|||
|
||||
private void FindPolymorphicSites(int start, int stop){
|
||||
boolean initialized, polymorphic, examined;
|
||||
char c = ' ';
|
||||
char c = ' ', ch = ' ';
|
||||
int A = 0, C = 0, G = 0, T = 0;
|
||||
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
|
||||
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
|
||||
//Find polymorphic sites in dictionary
|
||||
for (int pos = start; pos <= stop; pos++){
|
||||
initialized = false; polymorphic = false; examined = false;
|
||||
//look across all alleles at specific position to see if it is polymorphic
|
||||
A = 0; C = 0; G = 0; T = 0;
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
||||
if (!initialized){
|
||||
|
|
@ -122,19 +106,28 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
|||
initialized = true;
|
||||
examined = true;
|
||||
}
|
||||
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
|
||||
polymorphicsites.add(pos);
|
||||
out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
||||
ch = HLAreads[i].charAt(pos-HLAstartpos[i]);
|
||||
if (ch == 'A'){A++;}
|
||||
else if (ch == 'C'){C++;}
|
||||
else if (ch == 'T'){T++;}
|
||||
else if (ch == 'G'){G++;}
|
||||
|
||||
if (ch != c){
|
||||
// polymorphicsites.add(pos);
|
||||
// out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
||||
polymorphic = true;
|
||||
break;
|
||||
// break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (polymorphic){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",pos,A,C,G,T);
|
||||
}
|
||||
//if (!polymorphic && examined){
|
||||
// nonpolymorphicsites.add(pos);
|
||||
// out.printf("CONSERVED\t6\t%s\n", pos);
|
||||
//}
|
||||
|
||||
}
|
||||
if (!polymorphic && examined){
|
||||
nonpolymorphicsites.add(pos);
|
||||
out.printf("CONSERVED\t6\t%s\n", pos);
|
||||
}
|
||||
}
|
||||
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
|
||||
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
|
||||
|
|
|
|||
|
|
@ -7,41 +7,68 @@ import java.util.Hashtable;
|
|||
* @author shermanjia
|
||||
*/
|
||||
public class FrequencyFileReader {
|
||||
Hashtable AlleleFrequencies = new Hashtable();
|
||||
Hashtable UniqueAlleles = new Hashtable();
|
||||
Hashtable MaxFrequencies = new Hashtable();
|
||||
Hashtable CommonAlleles = new Hashtable();
|
||||
Hashtable [] AlleleFrequencies = null;
|
||||
String [] Populations = null;
|
||||
|
||||
public Hashtable GetAlleleFrequencies(){
|
||||
public Hashtable [] GetAlleleFrequencies(){
|
||||
//return allele frequencies for all populations
|
||||
return AlleleFrequencies;
|
||||
}
|
||||
|
||||
public Hashtable GetUniqueAlleles(){
|
||||
return UniqueAlleles;
|
||||
public Hashtable GetCommonAlleles(){
|
||||
//return list of common alleles
|
||||
return CommonAlleles;
|
||||
}
|
||||
public void ReadFile(String filename, String uniqueAllelesFile){
|
||||
|
||||
public Hashtable GetMaxFrequencies(){
|
||||
//return list of common alleles
|
||||
return MaxFrequencies;
|
||||
}
|
||||
|
||||
public String[] GetPopulations(){
|
||||
//Return name of populations
|
||||
return Populations;
|
||||
}
|
||||
|
||||
public void ReadFile(String filename, String ethnicity){
|
||||
try{
|
||||
int linenum = 0;
|
||||
FileInputStream fstream = new FileInputStream(filename);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null;
|
||||
//Read File Line By Line
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
linenum++;
|
||||
s = strLine.split("\\t");
|
||||
AlleleFrequencies.put(s[0], s[1]);
|
||||
//System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString());
|
||||
}
|
||||
in.close();
|
||||
|
||||
fstream = new FileInputStream(uniqueAllelesFile);
|
||||
in = new DataInputStream(fstream);
|
||||
br = new BufferedReader(new InputStreamReader(in));
|
||||
//Read File Line By Line
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
UniqueAlleles.put(strLine,strLine);
|
||||
//System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString());
|
||||
if (linenum == 1){
|
||||
//Determine number of populations, create a hash table for each population
|
||||
AlleleFrequencies = new Hashtable[s.length-1];
|
||||
Populations = new String[s.length-1];
|
||||
for (int i = 1; i < s.length; i++){
|
||||
Populations[i-1]=s[i];
|
||||
AlleleFrequencies[i-1] = new Hashtable();
|
||||
}
|
||||
}else{
|
||||
//assign allele frequencies for each population
|
||||
for (int i = 1; i < s.length; i++){
|
||||
if (Double.valueOf(s[i]) > 0.0001){
|
||||
CommonAlleles.put(s[0], s[0]);
|
||||
}
|
||||
AlleleFrequencies[i-1].put(s[0],s[i]);
|
||||
if (!MaxFrequencies.containsKey(s[0])){
|
||||
MaxFrequencies.put(s[0], s[i]);
|
||||
}else if (Double.valueOf(MaxFrequencies.get(s[0]).toString()) < Double.valueOf(s[i])){
|
||||
MaxFrequencies.put(s[0], s[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("FrequencyFileReader Error: " + e.getMessage());
|
||||
System.err.println("Exception in FrequencyFileReader (" + e.getMessage() + ").");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,737 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
/**
|
||||
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\
|
||||
dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class HLACallerWalker extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
|
||||
public String baseLikelihoodsFile = "";
|
||||
|
||||
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
|
||||
public boolean DEBUG = false;
|
||||
|
||||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||
public String filterFile = "";
|
||||
|
||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "CaucasiansUSA";
|
||||
|
||||
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
|
||||
public String debugAlleles = "";
|
||||
|
||||
@Argument(fullName = "phaseInterval", shortName = "phaseInterval", doc = "Use only these intervals in phase calculation", required = false)
|
||||
public String phaseIntervalFile = "";
|
||||
|
||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MINALLOWEDMISMATCHES = 7;
|
||||
|
||||
GATKArgumentCollection args = this.getToolkit().getArguments();
|
||||
|
||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
|
||||
String PolymorphicSitesfile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
|
||||
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
|
||||
|
||||
// Initializing variables
|
||||
|
||||
HLAFileReader HLADictionaryReader = new HLAFileReader();
|
||||
boolean HLAdataLoaded = false;
|
||||
String[] HLAnames, HLAreads, Populations;
|
||||
ArrayList<String> ReadsToDiscard;
|
||||
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
|
||||
|
||||
|
||||
int[][] numObservations, totalObservations, intervals;
|
||||
int[] SNPnumInRead, SNPposInRead, positions;
|
||||
CigarParser cigarparser = new CigarParser();
|
||||
Hashtable MaxLikelihoods = new Hashtable();
|
||||
Hashtable MaxFrequencies, CommonAlleles, AlleleCount, LocusCount;
|
||||
Hashtable[] AlleleFrequencies;
|
||||
int numIntervals;
|
||||
double[][] baseLikelihoods;
|
||||
|
||||
ArrayList AllelesToSearch = new ArrayList<String>();
|
||||
|
||||
// setting error rates for phasing algorithm (1% expected error rate for any genotype)
|
||||
|
||||
double P_err = 0.01;
|
||||
double P_correct = 1 - P_err;
|
||||
double L_err = Math.log10(P_err);
|
||||
double L_correct = Math.log10(P_correct);
|
||||
|
||||
public Integer reduceInit() {
    // One-time initialization guarded by HLAdataLoaded: loads the HLA allele
    // dictionary, the read-filtering / allele-search pre-processing results,
    // the per-position base likelihoods, the population allele frequencies,
    // and (optionally) the genomic intervals used for phasing.
    // Returns 0 (the initial reduce accumulator for the read traversal).
    if (!HLAdataLoaded){
        HLAdataLoaded = true;

        // Load HLA dictionary: per-allele sequences, names and genomic spans.
        HLADictionaryReader.ReadFile(HLAdatabaseFile);
        HLAreads = HLADictionaryReader.GetSequences();
        HLAnames = HLADictionaryReader.GetNames();
        HLAstartpos = HLADictionaryReader.GetStartPositions();
        HLAstoppos = HLADictionaryReader.GetStopPositions();

        // Load pre-processing file: reads flagged as misaligned (more than
        // MINALLOWEDMISMATCHES mismatches) and the candidate alleles to search.
        SimilarityFileReader similarityReader = new SimilarityFileReader();
        similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
        ReadsToDiscard = similarityReader.GetReadsToDiscard();
        AllelesToSearch = similarityReader.GetAllelesToSearch();
        AlleleCount = similarityReader.GetAlleleCount();
        LocusCount = similarityReader.GetLocusCount();
        for (int i = 0; i < AllelesToSearch.size(); i++){
            out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
        }

        // Load per-position genotype likelihoods and the polymorphic sites
        // (sites that differ from the reference) derived from them.
        BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
        baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
        baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods();
        positions = baseLikelihoodsReader.GetPositions();
        PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
        out.printf("INFO\t%s polymorphic sites found\n",PolymorphicSites.length);

        // Scratch arrays for phasing, sized to the number of polymorphic sites.
        // The *5 factor leaves one slot per base (A/C/G/T plus one spare) at
        // each site in the pairwise observation table.
        int l = PolymorphicSites.length;
        SNPnumInRead = new int[l];
        SNPposInRead = new int[l];
        numObservations = new int[l*5][l*5];
        totalObservations = new int[l][l];

        // Load allele frequencies for the different populations.
        FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
        HLAfreqReader.ReadFile(AlleleFrequencyFile,ethnicity);
        AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
        MaxFrequencies = HLAfreqReader.GetMaxFrequencies();
        CommonAlleles = HLAfreqReader.GetCommonAlleles();
        Populations = HLAfreqReader.GetPopulations();

        // Optionally load genomic intervals ("chr:start-stop" per line) that
        // restrict the phase calculation.
        if (!phaseIntervalFile.equals("")){
            TextFileReader fileReader = new TextFileReader();
            fileReader.ReadFile(phaseIntervalFile);
            String[] lines = fileReader.GetLines();
            intervals = new int[lines.length][2];
            for (int i = 0; i < lines.length; i++) {
                String[] s = lines[i].split(":");
                String[] intervalPieces = s[1].split("-");
                intervals[i][0] = Integer.valueOf(intervalPieces[0]);
                intervals[i][1] = Integer.valueOf(intervalPieces[1]);
            }
            numIntervals = intervals.length;
        }

    }
    return 0;
}
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
if (!ReadsToDiscard.contains(read.getReadName())){
|
||||
UpdateCorrelation(read);
|
||||
}else{
|
||||
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer numreads) {
|
||||
String name1, name2, d4_name1, d4_name2, d2_name1, d2_name2;
|
||||
Double frq1 = 0.0, frq2 = 0.0, log1 = 0.0, log2 = 0.0,alleleLikelihood= 0.0, phaseLikelihood=0.0, minfrq = 0.0, likelihood = 0.0;
|
||||
int numCombinations = 0;
|
||||
|
||||
//For debugging specific alleles
|
||||
if (!debugAlleles.equals("")){
|
||||
String s[] = debugAlleles.split(",");
|
||||
int index1 = HLADictionaryReader.GetIndex(s[0]);
|
||||
int index2 = HLADictionaryReader.GetIndex(s[1]);
|
||||
out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
|
||||
if (index1 > -1 && index2 > -1){
|
||||
alleleLikelihood = CalculateAlleleLikelihood(index1,index2,HLAreads,true);
|
||||
phaseLikelihood = CalculatePhaseLikelihood(index1,index2,true,false);
|
||||
}
|
||||
}
|
||||
|
||||
if (ONLYFREQUENT){
|
||||
minfrq = 0.0001;
|
||||
}
|
||||
|
||||
double max;
|
||||
ArrayList Output = new ArrayList<String>();
|
||||
ArrayList Likelihoods = new ArrayList<Double>();
|
||||
Hashtable TotalProb = new Hashtable();
|
||||
//Search pairs of alleles that satisfy initial search criteria
|
||||
|
||||
// Allele 1
|
||||
|
||||
for (int i = 0; i < HLAnames.length; i++){
|
||||
name1 = HLAnames[i].substring(4);
|
||||
String [] n1 = name1.split("\\*");
|
||||
d4_name1 = n1[0] + "*" + n1[1].substring(0, 4);
|
||||
d2_name1 = n1[0] + "*" + n1[1].substring(0, 2);
|
||||
if (AllelesToSearch.contains(d4_name1)){// || CommonAlleles.contains(d4_name1)
|
||||
|
||||
if (MaxFrequencies.containsKey(d4_name1)){
|
||||
frq1 = Double.parseDouble(MaxFrequencies.get(d4_name1).toString());
|
||||
}else{
|
||||
if (n1[1].length() > 4){if (n1[1].substring(4, 5).equals("N")){frq1 = .00000005;}else{frq1 = .000001;}}else{frq1 = .000001;}
|
||||
}
|
||||
|
||||
// Allele 2
|
||||
|
||||
for (int j = i; j < HLAnames.length; j++){
|
||||
name2 = HLAnames[j].substring(4);
|
||||
String [] n2 = name2.split("\\*");
|
||||
d4_name2 = n2[0] + "*" + n2[1].substring(0, 4);
|
||||
d2_name2 = n2[0] + "*" + n2[1].substring(0, 2);
|
||||
if (n1[0].equals(n2[0]) && (AllelesToSearch.contains(d4_name2))){// || CommonAlleles.contains(d4_name2)
|
||||
if (MaxFrequencies.containsKey(d4_name2)){
|
||||
frq2 = Double.parseDouble(MaxFrequencies.get(d4_name2).toString());
|
||||
}else{
|
||||
if (n2[1].length() > 4){if (n2[1].substring(4, 5).equals("N")){frq2 = .00000005;}else{frq2 = .000001;}}else{frq2 = .000001;}
|
||||
}
|
||||
//Calculate allele and phase likelihoods for each allele pair
|
||||
alleleLikelihood = CalculateAlleleLikelihood(i,j,HLAreads,false);
|
||||
numCombinations++;
|
||||
|
||||
//If there is data at the allele pair, continue with other calculations
|
||||
|
||||
if (alleleLikelihood < 0){
|
||||
phaseLikelihood = CalculatePhaseLikelihood(i,j,false,false);
|
||||
log1=Math.log10(frq1);
|
||||
log2=Math.log10(frq2);
|
||||
|
||||
//sum likelihoods
|
||||
|
||||
likelihood = alleleLikelihood+phaseLikelihood+log1+log2;
|
||||
if (!MaxLikelihoods.containsKey(n1[0])){MaxLikelihoods.put(n1[0], likelihood);}
|
||||
|
||||
if (likelihood > (Double) MaxLikelihoods.get(n1[0])) {
|
||||
MaxLikelihoods.put(n1[0], likelihood);
|
||||
}
|
||||
Likelihoods.add(likelihood);
|
||||
String data = String.format("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f",n1[0],name1,name2,alleleLikelihood,phaseLikelihood,log1,log2,likelihood);
|
||||
Output.add(data);
|
||||
out.printf("INFO\t%s\n",data);
|
||||
if (DEBUG){
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//Print output
|
||||
out.printf("Locus\tA1\tA2\tGeno\tPhase\tFrq1\tFrq2\tL\tProb\tReads1\tReads2\tLocus\tEXP");
|
||||
for (int i = 0; i < Populations.length; i++){
|
||||
out.printf("\t%s",Populations[i]);
|
||||
}
|
||||
out.printf("\n");
|
||||
|
||||
//Calculate probabilities for each locus
|
||||
Double probSum = 0.0, prob = 0.0, f1 = 0.0, f2 = 0.0, aLikelihood4 = 0.0, pLikelihood4 = 0.0;
|
||||
Integer count = 0;
|
||||
Hashtable HLA4DigitProbs = new Hashtable();
|
||||
Hashtable HLA4DigitLs = new Hashtable();
|
||||
Hashtable HLA4DigitCount = new Hashtable();
|
||||
Hashtable HLA4DigitF1 = new Hashtable();
|
||||
Hashtable HLA4DigitF2 = new Hashtable();
|
||||
Hashtable HLA4DigitA = new Hashtable();
|
||||
Hashtable HLA4DigitP = new Hashtable();
|
||||
|
||||
String key;
|
||||
Enumeration keys = LocusCount.keys();
|
||||
while (keys.hasMoreElements()){
|
||||
String locus = keys.nextElement().toString();
|
||||
|
||||
probSum = 0.0;
|
||||
ArrayList localOutput = new ArrayList<String>();
|
||||
ArrayList localLikelihoods = new ArrayList<Double>();
|
||||
|
||||
//Sum probabilities for each locus
|
||||
|
||||
for (int j = 0; j < Output.size(); j++){
|
||||
String data = Output.get(j).toString();
|
||||
String [] d = data.split("\\t");
|
||||
if (d[0].equals(locus)){
|
||||
localOutput.add(data);
|
||||
likelihood = (Double)Likelihoods.get(j)-(Double)MaxLikelihoods.get(locus);
|
||||
localLikelihoods.add(likelihood);
|
||||
probSum = probSum + Math.pow(10, likelihood);
|
||||
//out.printf("INFO\t%s\t%s\t%.2f\t%.2f\t%.2f\t%s\n",locus,data,likelihood,(Double)MaxLikelihoods.get(locus),(Double)Likelihoods.get(j),probSum);
|
||||
}
|
||||
}
|
||||
|
||||
//aggregate statistics for 4-digit types
|
||||
|
||||
String A1 = "", A2 = "", a1 = "", a2 = "";
|
||||
String [] s, s1, s2;
|
||||
Double prob4digit = 0.0;
|
||||
int n = 0;
|
||||
|
||||
for (int j = 0; j < localOutput.size(); j++){
|
||||
String data = localOutput.get(j).toString();
|
||||
prob = Math.pow(10, (Double)localLikelihoods.get(j))/probSum;
|
||||
|
||||
if (prob > 0.005){
|
||||
s = data.split("\\t");
|
||||
s1 = s[1].split("\\*");
|
||||
s2 = s[2].split("\\*");
|
||||
a1 = s1[0] + "*" + s1[1].substring(0,4);
|
||||
a2 = s2[0] + "*" + s2[1].substring(0,4);
|
||||
key = a1 + "," + a2;
|
||||
aLikelihood4 = Double.valueOf(s[3]);
|
||||
pLikelihood4 = Double.valueOf(s[4]);
|
||||
likelihood = aLikelihood4 + pLikelihood4 + f1 + f2;
|
||||
f1 = Double.valueOf(s[5]);
|
||||
f2 = Double.valueOf(s[6]);
|
||||
if (!HLA4DigitProbs.containsKey(key)){
|
||||
HLA4DigitProbs.put(key, prob);
|
||||
HLA4DigitLs.put(key, likelihood);
|
||||
HLA4DigitCount.put(key, 1);
|
||||
HLA4DigitF1.put(key,f1);
|
||||
HLA4DigitF2.put(key,f2);
|
||||
HLA4DigitA.put(key,aLikelihood4);
|
||||
HLA4DigitP.put(key,pLikelihood4);
|
||||
}else{
|
||||
prob = prob + Double.valueOf(HLA4DigitProbs.get(key).toString());
|
||||
HLA4DigitProbs.put(key, prob);
|
||||
likelihood = likelihood + Double.valueOf(HLA4DigitLs.get(key).toString());
|
||||
HLA4DigitLs.put(key, likelihood);
|
||||
n = Integer.valueOf(HLA4DigitCount.get(key).toString()) + 1;
|
||||
HLA4DigitCount.put(key, n);
|
||||
aLikelihood4 = aLikelihood4 + Double.valueOf(HLA4DigitA.get(key).toString());
|
||||
HLA4DigitA.put(key, aLikelihood4);
|
||||
pLikelihood4 = pLikelihood4 + Double.valueOf(HLA4DigitP.get(key).toString());
|
||||
HLA4DigitP.put(key, pLikelihood4);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Print results
|
||||
Enumeration P = HLA4DigitProbs.keys();
|
||||
String K = ""; String [] s, s1, s2;
|
||||
double count1, count2, locusCount, accountedFor;
|
||||
|
||||
// Sort hashtable.
|
||||
Vector v = new Vector(HLA4DigitProbs.keySet());
|
||||
Collections.sort(v);
|
||||
|
||||
// Display (sorted) hashtable.
|
||||
for (Enumeration e = v.elements(); e.hasMoreElements();) {
|
||||
K = (String)e.nextElement();
|
||||
prob = (Double) HLA4DigitProbs.get(K);
|
||||
|
||||
likelihood = (Double) HLA4DigitLs.get(K);
|
||||
count = (Integer) HLA4DigitCount.get(K);
|
||||
s = K.split("\\,");
|
||||
s1 = s[0].split("\\*"); name1 = s1[1];
|
||||
s2 = s[1].split("\\*"); name2 = s2[1];
|
||||
aLikelihood4 = (Double) HLA4DigitA.get(K);
|
||||
pLikelihood4 = (Double) HLA4DigitP.get(K);
|
||||
f1 = (Double) HLA4DigitF1.get(K);
|
||||
f2 = (Double) HLA4DigitF2.get(K);
|
||||
count1 = Double.valueOf(AlleleCount.get(s[0]).toString());
|
||||
count2 = Double.valueOf(AlleleCount.get(s[1]).toString());
|
||||
locusCount = Double.valueOf(LocusCount.get(s1[0]).toString());
|
||||
if (s[0].equals(s[1])){
|
||||
accountedFor = count1 / locusCount;
|
||||
}else{
|
||||
accountedFor = (count1 + count2) / locusCount;
|
||||
}
|
||||
if (prob > 0.1){
|
||||
out.printf("%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
|
||||
for (int i = 0; i < Populations.length; i++){
|
||||
if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
|
||||
if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
|
||||
if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
|
||||
}
|
||||
out.print("\n");
|
||||
}
|
||||
out.printf("INFO\t%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
|
||||
for (int i = 0; i < Populations.length; i++){
|
||||
if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
|
||||
if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
|
||||
if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
|
||||
}
|
||||
out.print("\n");
|
||||
}
|
||||
}
|
||||
|
||||
Comparator<Double> valueComparator = new Comparator<Double>() {
|
||||
@Override public int compare(Double val1, Double val2) {
|
||||
return val1.compareTo(val2);
|
||||
}
|
||||
};
|
||||
|
||||
private Integer[] InitializePolymorphicSites(){
|
||||
int HLA_A_start = 30018310, HLA_A_end = 30021211, num_A_positions = HLA_A_end - HLA_A_start + 1;
|
||||
int HLA_B_start = 31430239, HLA_B_end = 31432914, num_B_positions = HLA_B_end - HLA_B_start + 1;
|
||||
int HLA_C_start = 31344925, HLA_C_end = 31347827, num_C_positions = HLA_C_end - HLA_C_start + 1;
|
||||
Integer[] polymorphicSites = new Integer[num_A_positions+num_B_positions+num_C_positions];
|
||||
for (int i = 0; i < num_A_positions; i++){
|
||||
polymorphicSites[i]=HLA_A_start + i;
|
||||
}
|
||||
for (int i = 0; i < num_C_positions; i++){
|
||||
polymorphicSites[i+num_A_positions]=HLA_C_start + i;
|
||||
}
|
||||
for (int i = 0; i < num_B_positions; i++){
|
||||
polymorphicSites[i+num_A_positions+num_C_positions]=HLA_B_start + i;
|
||||
}
|
||||
return polymorphicSites;
|
||||
}
|
||||
|
||||
private int IndexOf(char c){
|
||||
switch(c){
|
||||
case 'A': return 0;
|
||||
case 'C': return 1;
|
||||
case 'G': return 2;
|
||||
case 'T': return 3;
|
||||
//case 'D': return 4;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean IsWithinInterval(int pos){
|
||||
boolean isWithinInterval = false;
|
||||
for (int i = 0; i < numIntervals; i++){
|
||||
if (pos >= intervals[i][0] && pos <= intervals[i][1]){
|
||||
isWithinInterval = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return isWithinInterval;
|
||||
}
|
||||
|
||||
private void UpdateCorrelation(SAMRecord read){
    // Updates the pairwise SNP correlation tables with the bases this read
    // shows at each polymorphic site (used later for phasing).
    // Side effects: overwrites the shared scratch arrays SNPnumInRead /
    // SNPposInRead and increments numObservations / totalObservations.
    String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());
    ArrayList<Integer> SNPsInRead = new ArrayList<Integer>();
    ArrayList<Integer> readindex = new ArrayList<Integer>();

    int readstart = read.getAlignmentStart();
    int readend = read.getAlignmentEnd();
    int numPositions = PolymorphicSites.length;
    char c1, c2;
    int a, b, i, j, SNPcount = 0;

    // Pass 1: mark which polymorphic sites fall strictly inside the read's
    // alignment span, recording their ordinal and offset within the read.
    // Sites outside the read are marked -1.
    for (i = 0; i < numPositions; i++){
        if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
            SNPnumInRead[i] = SNPcount;
            SNPposInRead[i] = PolymorphicSites[i]-readstart;
            SNPcount++;
        }else{
            SNPnumInRead[i] = -1;
            SNPposInRead[i] = -1;
        }
    }

    // Pass 2: for each ordered pair (i < j) of covered sites where both bases
    // are A/C/G/T, bump the observation counters. Row/column index is
    // site*5 + base index (5 slots per site in the table).
    for (i = 0; i < numPositions; i++){
        if (SNPnumInRead[i] > -1){
            c1 = s.charAt(SNPposInRead[i]);
            if (IndexOf(c1) > -1){
                for (j = i+1; j < numPositions; j ++){
                    if (SNPnumInRead[j] > -1){
                        c2 = s.charAt(SNPposInRead[j]);
                        if (IndexOf(c2) > -1){
                            a = i*5 + IndexOf(c1);
                            b = j*5 + IndexOf(c2);

                            numObservations[a][b]++;
                            totalObservations[i][j]++;
                            if (DEBUG){
                                // hook for per-pair debug output (disabled)
                            }
                        }
                    }
                }

            }
        }
    }
}
|
||||
|
||||
private int GenotypeIndex(char a, char b){
|
||||
switch(a){
|
||||
case 'A':
|
||||
switch(b){
|
||||
case 'A': return 0;
|
||||
case 'C': return 1;
|
||||
case 'G': return 2;
|
||||
case 'T': return 3;
|
||||
};
|
||||
case 'C':
|
||||
switch(b){
|
||||
case 'A': return 1;
|
||||
case 'C': return 4;
|
||||
case 'G': return 5;
|
||||
case 'T': return 6;
|
||||
};
|
||||
case 'G':
|
||||
switch(b){
|
||||
case 'A': return 2;
|
||||
case 'C': return 5;
|
||||
case 'G': return 7;
|
||||
case 'T': return 8;
|
||||
};
|
||||
case 'T':
|
||||
switch(b){
|
||||
case 'A': return 3;
|
||||
case 'C': return 6;
|
||||
case 'G': return 8;
|
||||
case 'T': return 9;
|
||||
};
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private double CalculateAlleleLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
|
||||
//Calculates likelihood for specific allele pair
|
||||
String read1 = HLAalleles[a1];
|
||||
String read2 = HLAalleles[a2];
|
||||
int start1 = HLAstartpos[a1];
|
||||
int start2 = HLAstartpos[a2];
|
||||
int stop1 = HLAstoppos[a1];
|
||||
int stop2 = HLAstoppos[a2];
|
||||
double likelihood = 0;
|
||||
int pos, index;
|
||||
char c1, c2;
|
||||
|
||||
|
||||
for (int i = 0; i < positions.length; i++){
|
||||
pos = positions[i];
|
||||
if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){
|
||||
index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2));
|
||||
if (index > -1){
|
||||
likelihood = likelihood + baseLikelihoods[i][index];
|
||||
if (debug){
|
||||
c1 = read1.charAt(pos-start1);
|
||||
c2 = read2.charAt(pos-start2);
|
||||
out.printf("INFO: DEBUG %s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return likelihood;
|
||||
}
|
||||
|
||||
private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG, boolean SINGLEALLELE){
    // Calculates the likelihood that this combination of alleles explains the
    // observed pairwise phase counts accumulated by UpdateCorrelation.
    // Side effects: overwrites the shared scratch arrays SNPnumInRead /
    // SNPposInRead (shared with UpdateCorrelation — single-threaded use assumed).
    // Returns: when SINGLEALLELE, the fraction of in-phase observations;
    // otherwise a log10 likelihood (numInPhase*log10(0.99) + numOutOfPhase*log10(0.01)
    // summed over site pairs, given the error constants declared on the class).
    double likelihood = 0, prob = 0;
    int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
    int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
    // Only the region covered by BOTH allele sequences is informative.
    int combinedstart = Math.max(readstart1,readstart2);
    int combinedstop = Math.min(readend1,readend2);

    int numPositions = PolymorphicSites.length, SNPcount = 0;
    int i, j, a1, a2, b1, b2;
    char c11, c12, c21, c22;
    int numInPhase = 0, numOutOfPhase = 0;
    double sumInPhase = 0.0, sumObservations = 0.0;

    // Pass 1: mark polymorphic sites strictly inside the shared span,
    // recording their ordinal and offset; others are marked -1.
    for (i = 0; i < numPositions; i++){

        if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop ){ // && IsWithinInterval(PolymorphicSites[i])
            if (PRINTDEBUG){
                out.printf("DEBUG\t%s\t%s\n",PolymorphicSites[i],SNPcount);
            }
            SNPnumInRead[i] = SNPcount;
            SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
            SNPcount++;
        }else{
            SNPnumInRead[i] = -1;
            SNPposInRead[i] = -1;
        }
    }
    String s1 = HLAreads[alleleIndex1];
    String s2 = HLAreads[alleleIndex2];
    if (PRINTDEBUG){
        out.printf("DEBUG %s SNPs found in %s and %s between %s and %s\n",SNPcount,HLAnames[alleleIndex1], HLAnames[alleleIndex2],combinedstart,combinedstop);
    }
    // Pass 2: for each covered site i, score it against the FIRST later site j
    // with observation data (note the break below — only the nearest
    // informative partner site is used for each i).
    for (i = 0; i < numPositions; i++){
        if (SNPnumInRead[i] > -1){
            c11 = s1.charAt(SNPposInRead[i]);
            c21 = s2.charAt(SNPposInRead[i]);
            if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
                for (j = i+1; j < numPositions; j ++){
                    if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
                        c12 = s1.charAt(SNPposInRead[j]);
                        c22 = s2.charAt(SNPposInRead[j]);
                        if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
                            // Table indices: site*5 + base index (see UpdateCorrelation).
                            a1 = i*5 + IndexOf(c11);
                            b1 = j*5 + IndexOf(c12);
                            a2 = i*5 + IndexOf(c21);
                            b2 = j*5 + IndexOf(c22);
                            // Check if the two alleles are identical at the chosen
                            // 2 locations; if so, count the shared phase cell once.
                            if ((c11 == c21) && (c12 == c22)){
                                numInPhase = numObservations[a1][b1];
                            }else{
                                numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
                            }
                            numOutOfPhase = totalObservations[i][j] - numInPhase;
                            sumInPhase += (double) numInPhase;
                            sumObservations += (double) totalObservations[i][j];
                            if (SINGLEALLELE){
                                // Running in-phase fraction (final value is returned).
                                likelihood = sumInPhase / sumObservations;
                            }else{
                                likelihood += numInPhase * L_correct + numOutOfPhase * L_err;
                            }

                            if (PRINTDEBUG){
                                out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%s / %s\t %.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],sumInPhase,sumObservations,likelihood);
                            }
                            break;
                        }
                    }
                }
            }
        }
    }
    return likelihood;
}
|
||||
|
||||
private void ExtraCode(){
    // Pre-processes HOMOZYGOUS allele combinations to rank 2-digit allele
    // classes per locus and seed AllelesToSearch with the top candidates
    // (efficiency pre-filter for the full pairwise search).
    // NOTE(review): not invoked from the visible traversal methods — appears
    // to be retained utility code; confirm whether it is still called.
    String name1, name2;
    // Per-2-digit-class best homozygous likelihoods and member counts.
    Hashtable Alleles2Digit = new Hashtable();
    Hashtable Phase2Digit = new Hashtable();
    Hashtable Count2Digit = new Hashtable();

    Hashtable AllelesAtLocus = new Hashtable();
    ArrayList Loci = new ArrayList<String>();
    double[] AlleleLikelihoods2 = new double[HLAnames.length];
    double[] PhaseLikelihoods2 = new double[HLAnames.length];
    // Score every allele paired with itself; a negative allele likelihood
    // means data was observed for it.
    for (int i = 0; i < HLAnames.length; i++){
        name1 = HLAnames[i].substring(4);
        String [] n1 = name1.split("\\*");
        AlleleLikelihoods2[i] = CalculateAlleleLikelihood(i,i,HLAreads,false);
        PhaseLikelihoods2[i] = CalculatePhaseLikelihood(i,i,false,true);
        if (AlleleLikelihoods2[i] < 0){
            name2 = n1[0] + "*" + n1[1].substring(0, 4);
            if (!Loci.contains(n1[0])){
                Loci.add(n1[0]);
                MaxLikelihoods.put(n1[0], 0.0);
                AllelesAtLocus.put(n1[0], 1);
            }else{
                AllelesAtLocus.put(n1[0], 1+(Integer)AllelesAtLocus.get(n1[0]));
            }
            // Keep the best (max) likelihoods seen for this 4-digit class.
            if (!Alleles2Digit.containsKey(name2)){
                Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
                Phase2Digit.put(name2, PhaseLikelihoods2[i]);
                Count2Digit.put(name2, 1.0);
            }else {
                if (AlleleLikelihoods2[i] > (Double) Alleles2Digit.get(name2)){
                    Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
                }
                if (PhaseLikelihoods2[i] > (Double) Phase2Digit.get(name2)){
                    Phase2Digit.put(name2, PhaseLikelihoods2[i]);
                }
                Count2Digit.put(name2,1.0+(Double)Count2Digit.get(name2));
            }
        }
    }

    // Sort allele classes within each locus and flag the top ones for deep search.
    for (int i = 0; i < Loci.size(); i++){
        Enumeration k = Alleles2Digit.keys();
        Hashtable AllelesAtLoci = new Hashtable();
        HashMap map = new HashMap();
        int numalleles = 0;
        // Find allele classes belonging to this locus; negate the likelihood
        // so that ascending sort below yields best-first order.
        while( k.hasMoreElements() ){
            name1 = k.nextElement().toString();
            String [] n1 = name1.split("\\*");
            if (Loci.get(i).equals(n1[0])){
                numalleles++;
                map.put(name1,-1 * (Double) Alleles2Digit.get(name1));
                AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
            }

        }

        // Sort classes at this locus; mark the top max(5, n/8) for deep search.
        List<Map.Entry<String, Double>> entries = new ArrayList<Entry<String, Double>>(map.entrySet());
        Collections.sort(entries, new Comparator<Entry<String, Double>>() {
            public int compare(Entry<String, Double> e1, Entry<String, Double> e2) {
                return e1.getValue().compareTo(e2.getValue());
            }
        });
        int num = 1;
        for (Map.Entry<String, Double> entry : entries) {
            if (num <= Math.max(5,entries.size()/8)){
                AllelesToSearch.add(entry.getKey());
                out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                num++;
            }else{
                // Classes below the cutoff are only reported; those already
                // flagged elsewhere keep their plain INFO line.
                if (!AllelesToSearch.contains(entry.getKey())){
                    out.printf("INFO\t%s\t%.2f\t%.2f\tNotSearched\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                }else{
                    out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                }
            }
        }

        out.printf("INFO\n");
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
public class HLAFileReader {
|
||||
ArrayList<String> Sequences = new ArrayList<String>();
|
||||
ArrayList<String> Names = new ArrayList<String>();
|
||||
ArrayList<Integer> StartPositions = new ArrayList<Integer>();
|
||||
ArrayList<Integer> StopPositions = new ArrayList<Integer>();
|
||||
int minstartpos;
|
||||
int maxstoppos;
|
||||
|
||||
CigarParser formatter = new CigarParser();
|
||||
|
||||
public String[] GetNames(){
|
||||
return Names.toArray(new String[Names.size()]);
|
||||
}
|
||||
|
||||
public String[] GetSequences(){
|
||||
return Sequences.toArray(new String[Sequences.size()]);
|
||||
}
|
||||
|
||||
public Integer[] GetStartPositions(){
|
||||
return StartPositions.toArray(new Integer[StartPositions.size()]);
|
||||
}
|
||||
|
||||
public Integer[] GetStopPositions(){
|
||||
return StopPositions.toArray(new Integer[StopPositions.size()]);
|
||||
}
|
||||
|
||||
|
||||
public Integer GetMinStartPos(){
|
||||
return minstartpos;
|
||||
}
|
||||
|
||||
public Integer GetMaxStopPos(){
|
||||
return maxstoppos;
|
||||
}
|
||||
|
||||
public int GetIndex(String readname){
|
||||
if (Names.contains(readname)){
|
||||
return Names.indexOf(readname);
|
||||
}else{
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
public void ReadFile(String filename){
|
||||
try{
|
||||
FileInputStream fstream = new FileInputStream(filename);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null;
|
||||
//Read File Line By Line
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
s = strLine.split("\\t");
|
||||
Sequences.add(s[3]);
|
||||
Names.add(s[0]);
|
||||
StartPositions.add(Integer.valueOf(s[1]));
|
||||
StopPositions.add(Integer.valueOf(s[2]));
|
||||
minstartpos = Math.min(minstartpos, Integer.valueOf(s[1]));
|
||||
maxstoppos = Math.max(maxstoppos, Integer.valueOf(s[2]));
|
||||
}
|
||||
in.close();
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("HLAFileReader Error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -24,6 +24,14 @@ public class PolymorphicSitesFileReader {
|
|||
return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]);
|
||||
}
|
||||
|
||||
public void AddSites(Integer [] sites){
|
||||
for (int i = 0; i < sites.length; i++){
|
||||
if (!PolymorphicSites.contains(sites[i])){
|
||||
PolymorphicSites.add(sites[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void ReadFile(String filename){
|
||||
try{
|
||||
FileInputStream fstream = new FileInputStream(filename);
|
||||
|
|
@ -34,10 +42,10 @@ public class PolymorphicSitesFileReader {
|
|||
int i = 0;
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
s = strLine.split("\\t");
|
||||
if (s[0].equals("POLYMORPHIC")){
|
||||
PolymorphicSites.add(Integer.valueOf(s[2]));
|
||||
if (Double.valueOf(s[8]) > 0.1){
|
||||
PolymorphicSites.add(Integer.valueOf(s[0]));
|
||||
}else{
|
||||
NonPolymorphicSites.add(Integer.valueOf(s[2]));
|
||||
NonPolymorphicSites.add(Integer.valueOf(s[0]));
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
|
|
|
|||
|
|
@ -14,6 +14,9 @@ import java.util.Hashtable;
|
|||
*/
|
||||
public class SimilarityFileReader {
|
||||
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
||||
ArrayList<String> AllelesToSearch = new ArrayList<String>();
|
||||
Hashtable AlleleCount = new Hashtable();
|
||||
Hashtable LocusCount = new Hashtable();
|
||||
Hashtable Concordance = new Hashtable();
|
||||
Hashtable NumMatches = new Hashtable();
|
||||
Hashtable NumMismatches = new Hashtable();
|
||||
|
|
@ -22,10 +25,22 @@ public class SimilarityFileReader {
|
|||
return ReadsToDiscard;
|
||||
}
|
||||
|
||||
public ArrayList<String> GetAllelesToSearch(){
|
||||
return AllelesToSearch;
|
||||
}
|
||||
|
||||
public String[] GetReadsToDiscardArray(){
|
||||
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
|
||||
}
|
||||
|
||||
public Hashtable GetAlleleCount(){
|
||||
return AlleleCount;
|
||||
}
|
||||
|
||||
public Hashtable GetLocusCount(){
|
||||
return LocusCount;
|
||||
}
|
||||
|
||||
public Hashtable GetConcordance(){
|
||||
return Concordance;
|
||||
}
|
||||
|
|
@ -43,26 +58,56 @@ public class SimilarityFileReader {
|
|||
FileInputStream fstream = new FileInputStream(filename);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null;
|
||||
String strLine; String [] s = null, alleles = null, a = null; String allele;
|
||||
//Read File Line By Line
|
||||
int i = 0;
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
s = strLine.split("\\t");
|
||||
if (s.length >= 6){
|
||||
Double matchFraction = Double.valueOf(s[4]);
|
||||
int numMismatches = Integer.valueOf(s[6]);
|
||||
|
||||
Double matchFraction = Double.valueOf(s[3]);
|
||||
int numMismatches = Integer.valueOf(s[5]);
|
||||
int numMatches = Integer.valueOf(s[4]);
|
||||
Concordance.put(s[0],matchFraction);
|
||||
NumMatches.put(s[0], s[5]);
|
||||
NumMatches.put(s[0], s[4]);
|
||||
NumMismatches.put(s[0], numMismatches);
|
||||
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
|
||||
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){
|
||||
ReadsToDiscard.add(s[0]);
|
||||
}else{
|
||||
Hashtable fourDigitAlleles = new Hashtable();
|
||||
alleles = s[6].split("\\,");
|
||||
if (alleles.length > 0){
|
||||
a = alleles[0].split("\\_");
|
||||
s = a[1].split("\\*");
|
||||
if (!LocusCount.containsKey(s[0])){
|
||||
LocusCount.put(s[0], 1);
|
||||
}else{
|
||||
LocusCount.put(s[0], (Integer) LocusCount.get(s[0]) + 1);
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < alleles.length; j++){
|
||||
a = alleles[j].split("\\_");
|
||||
s = a[1].split("\\*");
|
||||
allele = s[0] + "*" + s[1].substring(0,4);
|
||||
|
||||
if (!fourDigitAlleles.containsKey(allele)){
|
||||
fourDigitAlleles.put(allele, allele);
|
||||
if (!AlleleCount.containsKey(allele)){
|
||||
AlleleCount.put(allele, 1);
|
||||
}else{
|
||||
AlleleCount.put(allele, (Integer) AlleleCount.get(allele) + 1);
|
||||
}
|
||||
|
||||
if ((Integer) AlleleCount.get(allele) > 1 && !AllelesToSearch.contains(allele)){
|
||||
AllelesToSearch.add(allele);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("SimilarityFile Error: " + e.getMessage());
|
||||
//System.err.println("SimilarityFile Error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue