HLA caller updated: it now searches all alleles (both common and rare), and read filtering and allele comparison run more efficiently.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3543 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d51e6c45a7
commit
5704294f9d
|
|
@ -33,7 +33,9 @@ import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Hashtable;
|
import java.util.Hashtable;
|
||||||
|
import java.util.Enumeration;
|
||||||
|
import java.util.Vector;
|
||||||
|
import java.util.Collections;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculates likelihood of observing the data given pairs of HLA alleles. NOTE: run CalculateBaseLikelihoods first! Usage: java -jar GenomeAnalysisTK.jar -T CalculateAlleleLikelihoods -I /humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods -eth\
|
* Calculates likelihood of observing the data given pairs of HLA alleles. NOTE: run CalculateBaseLikelihoods first! Usage: java -jar GenomeAnalysisTK.jar -T CalculateAlleleLikelihoods -I /humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods -eth\
|
||||||
|
|
@ -60,17 +62,20 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles4Digit";
|
||||||
Hashtable AlleleFrequencies,UniqueAlleles;
|
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.txt";
|
||||||
|
String HLA2DigitFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY_2DIGIT.txt";
|
||||||
|
|
||||||
|
Hashtable AlleleFrequencies,UniqueAlleles,Alleles2Digit;
|
||||||
|
|
||||||
CigarParser formatter = new CigarParser();
|
CigarParser formatter = new CigarParser();
|
||||||
double[][] baseLikelihoods;
|
double[][] baseLikelihoods;
|
||||||
int[] positions;
|
int[] positions;
|
||||||
boolean loaded = false;
|
boolean loaded = false;
|
||||||
|
|
||||||
String[] HLAnames, HLAreads;
|
String[] HLAnames, HLAreads, HLAnames2, HLAreads2;
|
||||||
Integer[] HLAstartpos, HLAstoppos;
|
Integer[] HLAstartpos, HLAstoppos, HLAstartpos2, HLAstoppos2;
|
||||||
ArrayList<String> HLAnamesAL, HLAreadsAL;
|
ArrayList<String> HLAnamesAL, HLAreadsAL, Loci, AllelesToSearch;
|
||||||
ArrayList<Integer> HLAstartposAL, HLAstopposAL;
|
ArrayList<Integer> HLAstartposAL, HLAstopposAL;
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
|
|
@ -86,21 +91,27 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
HLAstartposAL = new ArrayList<Integer>();
|
HLAstartposAL = new ArrayList<Integer>();
|
||||||
HLAstopposAL = new ArrayList<Integer>();
|
HLAstopposAL = new ArrayList<Integer>();
|
||||||
|
|
||||||
if (!ethnicity.equals("CaucasianUSA")){
|
out.printf("INFO Reading HLA alleles ... ");
|
||||||
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
|
HLAFileReader HLADictionaryReader = new HLAFileReader();
|
||||||
}
|
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
HLAreads = HLADictionaryReader.GetSequences();
|
||||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
HLAnames = HLADictionaryReader.GetNames();
|
||||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||||
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
|
|
||||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
HLADictionaryReader = new HLAFileReader();
|
||||||
|
HLADictionaryReader.ReadFile(HLA2DigitFile);
|
||||||
|
HLAreads2 = HLADictionaryReader.GetSequences();
|
||||||
|
HLAnames2 = HLADictionaryReader.GetNames();
|
||||||
|
HLAstartpos2 = HLADictionaryReader.GetStartPositions();
|
||||||
|
HLAstoppos2 = HLADictionaryReader.GetStopPositions();
|
||||||
|
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
||||||
|
|
||||||
//out.printf("INFO Common alleles:\n");
|
//out.printf("INFO Common alleles:\n");
|
||||||
for (int i = 1; i < UniqueAlleles.size(); i++){
|
for (int i = 1; i < UniqueAlleles.size(); i++){
|
||||||
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
|
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
|
||||||
}
|
}
|
||||||
out.printf("INFO Reading HLA dictionary ...");
|
//out.printf("INFO Reading HLA dictionary ...");
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -108,10 +119,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
HLAnamesAL.add(read.getReadName());
|
//HLAnamesAL.add(read.getReadName());
|
||||||
HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
|
//HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
|
||||||
HLAstartposAL.add(read.getAlignmentStart());
|
//HLAstartposAL.add(read.getAlignmentStart());
|
||||||
HLAstopposAL.add(read.getAlignmentEnd());
|
//HLAstopposAL.add(read.getAlignmentEnd());
|
||||||
|
//out.printf("INFO\t%s\t%s\t%s\t%s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),formatter.FormatRead(read.getCigarString(), read.getReadString()));
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -154,11 +166,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone(Integer numreads) {
|
public void onTraversalDone(Integer numreads) {
|
||||||
out.printf("Done! %s alleles found\n", numreads);
|
//out.printf("Done! %s alleles found\n", numreads);
|
||||||
HLAnames = HLAnamesAL.toArray(new String[numreads]);
|
//HLAnames = HLAnamesAL.toArray(new String[numreads]);
|
||||||
HLAreads = HLAreadsAL.toArray(new String[numreads]);
|
//HLAreads = HLAreadsAL.toArray(new String[numreads]);
|
||||||
HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
|
//HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
|
||||||
HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
|
//HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
|
||||||
|
|
||||||
double[][] AlleleLikelihoods = new double[numreads][numreads];
|
double[][] AlleleLikelihoods = new double[numreads][numreads];
|
||||||
|
|
||||||
|
|
@ -173,6 +185,7 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
int numcombinations = 0;
|
int numcombinations = 0;
|
||||||
out.printf("NUM\tAllele1\tAllele2\tSSG\n");
|
out.printf("NUM\tAllele1\tAllele2\tSSG\n");
|
||||||
|
|
||||||
|
//debugging specific alleles
|
||||||
int index1 = -1, index2 = -1;
|
int index1 = -1, index2 = -1;
|
||||||
if (!debugAlleles.equals("")){
|
if (!debugAlleles.equals("")){
|
||||||
String s[] = debugAlleles.split(",");
|
String s[] = debugAlleles.split(",");
|
||||||
|
|
@ -185,52 +198,96 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
}
|
}
|
||||||
if (index1 > -1 && index2 > -1){
|
if (index1 > -1 && index2 > -1){
|
||||||
out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
|
out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
|
||||||
double dl = CalculateLikelihood(index1,index2,true);
|
double dl = CalculateLikelihood(index1,index2,HLAreads2,true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Pre-process homozygous combinations to determine top possible alleles (for efficiency)
|
||||||
|
int numreads2 = HLAnames2.length;
|
||||||
|
Alleles2Digit = new Hashtable();
|
||||||
|
Loci = new ArrayList<String>();
|
||||||
|
double[] AlleleLikelihoods2 = new double[numreads];
|
||||||
for (int i = 0; i < numreads; i++){
|
for (int i = 0; i < numreads; i++){
|
||||||
name1 = HLAnames[i].substring(4);
|
name1 = HLAnames[i].substring(4);
|
||||||
String [] n1 = name1.split("\\*");
|
String [] n1 = name1.split("\\*");
|
||||||
// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3));
|
numcombinations++;
|
||||||
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
|
AlleleLikelihoods2[i] = CalculateLikelihood(i,i,HLAreads,false);
|
||||||
|
if (AlleleLikelihoods2[i] < 0){
|
||||||
|
name2 = n1[0] + "*" + n1[1].substring(0, 2);
|
||||||
|
if (!Loci.contains(n1[0])){Loci.add(n1[0]);}
|
||||||
|
if (!Alleles2Digit.containsKey(name2)){
|
||||||
|
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
|
||||||
|
}else if ((Double) Alleles2Digit.get(name2) < AlleleLikelihoods2[i]){
|
||||||
|
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Sort alleles at 2 digit resolution for each locus
|
||||||
|
AllelesToSearch = new ArrayList<String>();
|
||||||
|
for (int i = 0; i < Loci.size(); i++){
|
||||||
|
Enumeration k = Alleles2Digit.keys();
|
||||||
|
Hashtable AllelesAtLoci = new Hashtable();
|
||||||
|
|
||||||
|
//find alleles at the locus
|
||||||
|
while( k.hasMoreElements() ){
|
||||||
|
name1 = k.nextElement().toString();
|
||||||
|
String [] n1 = name1.split("\\*");
|
||||||
|
if (Loci.get(i).equals(n1[0])){
|
||||||
|
AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Sort alleles at locus, mark the top ten 2-digit classes for deep search
|
||||||
|
int num = 1;
|
||||||
|
Vector v = new Vector(AllelesAtLoci.keySet());
|
||||||
|
Collections.sort(v);
|
||||||
|
for (Enumeration e = v.elements(); e.hasMoreElements();) {
|
||||||
|
Double key = Double.valueOf(e.nextElement().toString());
|
||||||
|
String allele = AllelesAtLoci.get(key).toString();
|
||||||
|
if (num <= 10){
|
||||||
|
AllelesToSearch.add(allele);
|
||||||
|
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
//out.printf("%s\t%s\n",allele,key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Iterate through allele pairs to calculate likelihoods
|
||||||
|
if (true){
|
||||||
|
numcombinations = 0;
|
||||||
|
for (int i = 0; i < numreads; i++){
|
||||||
|
name1 = HLAnames[i].substring(4);
|
||||||
|
String [] n1 = name1.split("\\*");
|
||||||
|
if (AllelesToSearch.contains(n1[0] + "*" + n1[1].substring(0, 2))){
|
||||||
//out.printf("1: %s\n",name1);
|
//out.printf("1: %s\n",name1);
|
||||||
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||||
//if (frq1 > minfrq){
|
//if (frq1 > minfrq){
|
||||||
for (int j = i; j < numreads; j++){
|
for (int j = i; j < numreads; j++){
|
||||||
name2 = HLAnames[j].substring(4);
|
name2 = HLAnames[j].substring(4);
|
||||||
String [] n2 = name2.split("\\*");
|
String [] n2 = name2.split("\\*");
|
||||||
// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3));
|
if (AllelesToSearch.contains(n2[0] + "*" + n2[1].substring(0, 2))){
|
||||||
if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
|
|
||||||
|
|
||||||
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
|
||||||
// if (frq2 > minfrq){
|
|
||||||
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
|
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
|
||||||
numcombinations++;
|
numcombinations++;
|
||||||
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,false);
|
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,HLAreads,false);
|
||||||
|
if (AlleleLikelihoods[i][j] < 0){
|
||||||
out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]);
|
out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]);
|
||||||
}
|
}
|
||||||
// }else{
|
|
||||||
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name2,frq2);}
|
|
||||||
// }
|
|
||||||
// }else{
|
|
||||||
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name2);}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//}else{
|
}
|
||||||
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
|
}
|
||||||
//}
|
|
||||||
//}else{
|
|
||||||
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private double CalculateLikelihood(int a1, int a2, boolean debug){
|
private double CalculateLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
|
||||||
String read1 = HLAreads[a1];
|
//Calculates likelihood for specific allele pair
|
||||||
String read2 = HLAreads[a2];
|
String read1 = HLAalleles[a1];
|
||||||
|
String read2 = HLAalleles[a2];
|
||||||
int start1 = HLAstartpos[a1];
|
int start1 = HLAstartpos[a1];
|
||||||
int start2 = HLAstartpos[a2];
|
int start2 = HLAstartpos[a2];
|
||||||
int stop1 = HLAstoppos[a1];
|
int stop1 = HLAstoppos[a1];
|
||||||
|
|
|
||||||
|
|
@ -72,34 +72,13 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
||||||
ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
|
ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
|
||||||
ArrayList<String> AllReadNames = new ArrayList<String>();
|
ArrayList<String> AllReadNames = new ArrayList<String>();
|
||||||
|
|
||||||
boolean HLAdataLoaded = false;
|
boolean dataLoaded = false;
|
||||||
|
|
||||||
|
|
||||||
//Loads HLA dictionary, allele frequencies, and reads to filter
|
//Loads reads to filter
|
||||||
public Pair<Long, Long> reduceInit() {
|
public Pair<Long, Long> reduceInit() {
|
||||||
if (!HLAdataLoaded){
|
if (!dataLoaded){
|
||||||
HLAdataLoaded = true;
|
dataLoaded = true;
|
||||||
|
|
||||||
out.printf("INFO Reading HLA database ... ");
|
|
||||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
|
||||||
HLAreads = HLADictionaryReader.GetReads();
|
|
||||||
HLAnames = HLADictionaryReader.GetReadNames();
|
|
||||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
|
||||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
|
||||||
InitializeVariables(HLAreads.length);
|
|
||||||
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if (!ethnicity.equals("CaucasianUSA")){
|
|
||||||
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
|
|
||||||
}
|
|
||||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
|
||||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
|
||||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
|
||||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
|
||||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
|
||||||
|
|
||||||
|
|
||||||
if (!filterFile.equals("")){
|
if (!filterFile.equals("")){
|
||||||
out.printf("INFO Reading properties file ... ");
|
out.printf("INFO Reading properties file ... ");
|
||||||
|
|
@ -137,7 +116,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
||||||
|
|
||||||
int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
|
int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
|
||||||
//if (DEBUG){
|
//if (DEBUG){
|
||||||
out.printf("%s\t%s\t", context.getLocation(),ref.getBase());
|
out.printf("%s\t%s\t", context.getLocation(),(char)ref.getBase());
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//Calculate posterior probabilities
|
//Calculate posterior probabilities
|
||||||
|
|
|
||||||
|
|
@ -54,23 +54,24 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||||
public String ethnicity = "Caucasian";
|
public String ethnicity = "Caucasian";
|
||||||
|
|
||||||
|
@Argument(fullName = "useInterval", shortName = "useInterval", doc = "Use only these intervals", required = false)
|
||||||
|
public String intervalFile = "";
|
||||||
|
|
||||||
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
|
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
|
||||||
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
|
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
|
||||||
|
|
||||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||||
public boolean ONLYFREQUENT = false;
|
public boolean ONLYFREQUENT = false;
|
||||||
|
|
||||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
|
||||||
|
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
|
||||||
|
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
|
||||||
|
|
||||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
HLAFileReader HLADictionaryReader = new HLAFileReader();
|
||||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
|
||||||
String AlleleFrequencyFile;
|
|
||||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
|
||||||
|
|
||||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
|
||||||
|
|
||||||
boolean DatabaseLoaded = false;
|
boolean DatabaseLoaded = false;
|
||||||
boolean DEBUG = false;
|
boolean DEBUG = false;
|
||||||
|
ArrayList<String> ClosestAlleles = new ArrayList<String>();
|
||||||
|
|
||||||
String[] HLAnames, HLAreads;
|
String[] HLAnames, HLAreads;
|
||||||
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
|
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
|
||||||
|
|
@ -88,6 +89,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
Hashtable AlleleFrequencies = new Hashtable();
|
Hashtable AlleleFrequencies = new Hashtable();
|
||||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||||
CigarParser formatter = new CigarParser();
|
CigarParser formatter = new CigarParser();
|
||||||
|
int [][] intervals; int numIntervals;
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
if (!DatabaseLoaded){
|
if (!DatabaseLoaded){
|
||||||
|
|
@ -96,9 +98,9 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
//Load HLA dictionary
|
//Load HLA dictionary
|
||||||
out.printf("INFO Loading HLA dictionary ... ");
|
out.printf("INFO Loading HLA dictionary ... ");
|
||||||
|
|
||||||
HLADictionaryReader.ReadFile(HLAdictionaryFile);
|
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||||
HLAreads = HLADictionaryReader.GetReads();
|
HLAreads = HLADictionaryReader.GetSequences();
|
||||||
HLAnames = HLADictionaryReader.GetReadNames();
|
HLAnames = HLADictionaryReader.GetNames();
|
||||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||||
minstartpos = HLADictionaryReader.GetMinStartPos();
|
minstartpos = HLADictionaryReader.GetMinStartPos();
|
||||||
|
|
@ -110,20 +112,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
concordance = new double[HLAreads.length];
|
concordance = new double[HLAreads.length];
|
||||||
numcompared = new double[HLAreads.length];
|
numcompared = new double[HLAreads.length];
|
||||||
|
|
||||||
//Read allele frequencies
|
//Load list of polymorphic sites
|
||||||
if (ethnicity.equals("Black")){
|
|
||||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
|
||||||
}else{
|
|
||||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
|
||||||
}
|
|
||||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
|
||||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
|
||||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
|
||||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
|
||||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
|
||||||
|
|
||||||
//FindPolymorphicSites(minstartpos,maxstoppos);
|
|
||||||
|
|
||||||
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
|
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
|
||||||
siteFileReader.ReadFile(PolymorphicSitesFile);
|
siteFileReader.ReadFile(PolymorphicSitesFile);
|
||||||
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
||||||
|
|
@ -131,6 +120,20 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
numpolymorphicsites = PolymorphicSites.length;
|
numpolymorphicsites = PolymorphicSites.length;
|
||||||
numnonpolymorphicsites = NonPolymorphicSites.length;
|
numnonpolymorphicsites = NonPolymorphicSites.length;
|
||||||
|
|
||||||
|
if (!intervalFile.equals("")){
|
||||||
|
TextFileReader fileReader = new TextFileReader();
|
||||||
|
fileReader.ReadFile(intervalFile);
|
||||||
|
String[] lines = fileReader.GetLines();
|
||||||
|
intervals = new int[lines.length][2];
|
||||||
|
for (int i = 0; i < lines.length; i++) {
|
||||||
|
String[] s = lines[i].split(":");
|
||||||
|
String[] intervalPieces = s[1].split("-");
|
||||||
|
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
|
||||||
|
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
|
||||||
|
}
|
||||||
|
numIntervals = intervals.length;
|
||||||
|
}
|
||||||
|
|
||||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||||
out.printf("INFO Comparing reads to database ...\n");
|
out.printf("INFO Comparing reads to database ...\n");
|
||||||
|
|
||||||
|
|
@ -167,7 +170,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
|
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
|
||||||
for (int j = 0; j < numpolymorphicsites; j++){
|
for (int j = 0; j < numpolymorphicsites; j++){
|
||||||
pos = PolymorphicSites[j];
|
pos = PolymorphicSites[j];
|
||||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){
|
||||||
c1 = s1.charAt(pos-readstart);
|
c1 = s1.charAt(pos-readstart);
|
||||||
c2 = s2.charAt(pos-allelestart);
|
c2 = s2.charAt(pos-allelestart);
|
||||||
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
||||||
|
|
@ -176,7 +179,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
nummatched[i]++;
|
nummatched[i]++;
|
||||||
}else{
|
}else{
|
||||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, pos,c1,c2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -187,13 +190,13 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
if (numcompared[i] > 0){
|
if (numcompared[i] > 0){
|
||||||
for (int j = 0; j < numnonpolymorphicsites; j++){
|
for (int j = 0; j < numnonpolymorphicsites; j++){
|
||||||
pos = NonPolymorphicSites[j];
|
pos = NonPolymorphicSites[j];
|
||||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){
|
||||||
c1 = s1.charAt(pos-readstart);
|
c1 = s1.charAt(pos-readstart);
|
||||||
c2 = s2.charAt(pos-allelestart);
|
c2 = s2.charAt(pos-allelestart);
|
||||||
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
||||||
numcompared[i]++;
|
numcompared[i]++;
|
||||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -205,7 +208,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
concordance[i]=nummatched[i]/numcompared[i];
|
concordance[i]=nummatched[i]/numcompared[i];
|
||||||
if (concordance[i] > maxConcordance){maxConcordance = concordance[i];}
|
if (concordance[i] > maxConcordance){maxConcordance = concordance[i];}
|
||||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||||
}
|
}
|
||||||
if (findFirst && (concordance[i] == 1)){
|
if (findFirst && (concordance[i] == 1)){
|
||||||
break;
|
break;
|
||||||
|
|
@ -228,32 +231,42 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
return maxFreq;
|
return maxFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean IsWithinInterval(int pos){
|
||||||
|
boolean isWithinInterval = false;
|
||||||
|
for (int i = 0; i < numIntervals; i++){
|
||||||
|
if (pos >= intervals[i][0] && pos <= intervals[i][1]){
|
||||||
|
isWithinInterval = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return isWithinInterval;
|
||||||
|
}
|
||||||
|
|
||||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
//Calculate concordance for this read and all overlapping reads
|
//Calculate concordance for this read and all overlapping reads
|
||||||
if (read.getMappingQuality() > 0){
|
if (read.getMappingQuality() > 0){
|
||||||
double maxConcordance = CalculateConcordance(read);
|
double maxConcordance = CalculateConcordance(read);
|
||||||
|
String stats = "", topAlleles = "";
|
||||||
if (maxConcordance > 0){
|
if (maxConcordance > 0){
|
||||||
String readname = read.getReadName(), allelename = ""; double freq;
|
String readname = read.getReadName(), allelename = ""; double freq;
|
||||||
//For input bam files that contain HLA alleles, find and print allele frequency
|
//For input bam files that contain HLA alleles, find and print allele frequency
|
||||||
//freq = GetAlleleFrequency(readname);
|
|
||||||
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
|
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
|
||||||
|
|
||||||
//Find the maximum frequency of the alleles most concordant with the read
|
|
||||||
//double maxFreq = FindMaxAlleleFrequency(maxConcordance);
|
|
||||||
|
|
||||||
//Print concordance statistics between this read and the most similar HLA allele(s)
|
//Print concordance statistics between this read and the most similar HLA allele(s)
|
||||||
|
|
||||||
for (int i = 0; i < HLAreads.length; i++){
|
for (int i = 0; i < HLAreads.length; i++){
|
||||||
if (concordance[i] == maxConcordance){
|
if (concordance[i] == maxConcordance){
|
||||||
freq = GetAlleleFrequency(HLAnames[i]);
|
//freq = GetAlleleFrequency(HLAnames[i]);
|
||||||
//if (freq == maxFreq){
|
if (topAlleles.equals("")){
|
||||||
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
topAlleles = HLAnames[i];
|
||||||
//}
|
}else{
|
||||||
break;
|
topAlleles = topAlleles + "," + HLAnames[i];
|
||||||
|
}
|
||||||
|
stats = String.format("%.1f\t%.3f\t%.0f\t%.0f",1.0,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out.print("\n");
|
out.printf("\t%s\t%s\n",stats,topAlleles);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
||||||
|
|
@ -29,18 +29,12 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
||||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||||
public boolean ONLYFREQUENT = false;
|
public boolean ONLYFREQUENT = false;
|
||||||
|
|
||||||
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
|
||||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
|
||||||
|
|
||||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
|
||||||
|
|
||||||
//String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
|
||||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
|
||||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
|
||||||
String AlleleFrequencyFile;
|
|
||||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||||
|
|
||||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
|
||||||
|
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
|
||||||
|
HLAFileReader HLADictionaryReader = new HLAFileReader();
|
||||||
|
|
||||||
boolean DatabaseLoaded = false;
|
boolean DatabaseLoaded = false;
|
||||||
boolean DEBUG = false;
|
boolean DEBUG = false;
|
||||||
|
|
@ -69,8 +63,8 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
||||||
out.printf("INFO Loading HLA dictionary ... ");
|
out.printf("INFO Loading HLA dictionary ... ");
|
||||||
|
|
||||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||||
HLAreads = HLADictionaryReader.GetReads();
|
HLAreads = HLADictionaryReader.GetSequences();
|
||||||
HLAnames = HLADictionaryReader.GetReadNames();
|
HLAnames = HLADictionaryReader.GetNames();
|
||||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||||
minstartpos = HLADictionaryReader.GetMinStartPos();
|
minstartpos = HLADictionaryReader.GetMinStartPos();
|
||||||
|
|
@ -82,18 +76,6 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
||||||
concordance = new double[HLAreads.length];
|
concordance = new double[HLAreads.length];
|
||||||
numcompared = new double[HLAreads.length];
|
numcompared = new double[HLAreads.length];
|
||||||
|
|
||||||
//Read allele frequencies
|
|
||||||
if (ethnicity.equals("Black")){
|
|
||||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
|
||||||
}else{
|
|
||||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
|
||||||
}
|
|
||||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
|
||||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
|
||||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
|
||||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
|
||||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
|
||||||
|
|
||||||
FindPolymorphicSites(minstartpos,maxstoppos);
|
FindPolymorphicSites(minstartpos,maxstoppos);
|
||||||
|
|
||||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||||
|
|
@ -108,13 +90,15 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
|
||||||
private void FindPolymorphicSites(int start, int stop){
|
private void FindPolymorphicSites(int start, int stop){
|
||||||
boolean initialized, polymorphic, examined;
|
boolean initialized, polymorphic, examined;
|
||||||
char c = ' ';
|
char c = ' ', ch = ' ';
|
||||||
|
int A = 0, C = 0, G = 0, T = 0;
|
||||||
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
|
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
|
||||||
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
|
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
|
||||||
//Find polymorphic sites in dictionary
|
//Find polymorphic sites in dictionary
|
||||||
for (int pos = start; pos <= stop; pos++){
|
for (int pos = start; pos <= stop; pos++){
|
||||||
initialized = false; polymorphic = false; examined = false;
|
initialized = false; polymorphic = false; examined = false;
|
||||||
//look across all alleles at specific position to see if it is polymorphic
|
//look across all alleles at specific position to see if it is polymorphic
|
||||||
|
A = 0; C = 0; G = 0; T = 0;
|
||||||
for (int i = 0; i < HLAreads.length; i++){
|
for (int i = 0; i < HLAreads.length; i++){
|
||||||
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
||||||
if (!initialized){
|
if (!initialized){
|
||||||
|
|
@ -122,20 +106,29 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
||||||
initialized = true;
|
initialized = true;
|
||||||
examined = true;
|
examined = true;
|
||||||
}
|
}
|
||||||
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
|
ch = HLAreads[i].charAt(pos-HLAstartpos[i]);
|
||||||
polymorphicsites.add(pos);
|
if (ch == 'A'){A++;}
|
||||||
out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
else if (ch == 'C'){C++;}
|
||||||
polymorphic = true;
|
else if (ch == 'T'){T++;}
|
||||||
break;
|
else if (ch == 'G'){G++;}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if (ch != c){
|
||||||
|
// polymorphicsites.add(pos);
|
||||||
|
// out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
||||||
|
polymorphic = true;
|
||||||
|
// break;
|
||||||
}
|
}
|
||||||
if (!polymorphic && examined){
|
|
||||||
nonpolymorphicsites.add(pos);
|
|
||||||
out.printf("CONSERVED\t6\t%s\n", pos);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (polymorphic){
|
||||||
|
out.printf("%s\t%s\t%s\t%s\t%s\n",pos,A,C,G,T);
|
||||||
|
}
|
||||||
|
//if (!polymorphic && examined){
|
||||||
|
// nonpolymorphicsites.add(pos);
|
||||||
|
// out.printf("CONSERVED\t6\t%s\n", pos);
|
||||||
|
//}
|
||||||
|
|
||||||
|
}
|
||||||
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
|
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
|
||||||
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
|
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,41 +7,68 @@ import java.util.Hashtable;
|
||||||
* @author shermanjia
|
* @author shermanjia
|
||||||
*/
|
*/
|
||||||
public class FrequencyFileReader {
    /*
     * Loads a tab-separated table of HLA allele frequencies.
     * Line 1 is a header: the first column is a label, every further column names a population.
     * Every other line is: allele name, then one frequency per population.
     * Raw Hashtables are kept on purpose so the getters keep returning the exact types callers assign from.
     */

    /** Allele name -> largest frequency (kept as the original text) seen in any population. */
    Hashtable MaxFrequencies = new Hashtable();
    /** Allele name -> allele name, for every allele whose frequency exceeds 0.0001 in at least one population. */
    Hashtable CommonAlleles = new Hashtable();
    /** One table per population (same order as {@link #Populations}): allele name -> frequency text. Null until a file is read. */
    Hashtable [] AlleleFrequencies = null;
    /** Population names taken from the header line. Null until a file is read. */
    String [] Populations = null;

    /**
     * @return allele frequencies for all populations, one table per population; null if no file has been read
     */
    public Hashtable [] GetAlleleFrequencies(){
        return AlleleFrequencies;
    }

    /**
     * @return the alleles whose frequency is above 0.0001 in at least one population
     */
    public Hashtable GetCommonAlleles(){
        return CommonAlleles;
    }

    /**
     * @return for each allele, the highest frequency observed across all populations
     */
    public Hashtable GetMaxFrequencies(){
        return MaxFrequencies;
    }

    /**
     * @return names of the populations, in header order; null if no file has been read
     */
    public String[] GetPopulations(){
        return Populations;
    }

    /**
     * Reads the frequency table and fills the per-population, maximum-frequency and common-allele tables.
     * Errors (missing file, non-numeric frequency, ...) are reported on stderr and stop the load;
     * whatever was read up to that point stays available.
     *
     * @param filename  path of the tab-separated frequency file
     * @param ethnicity currently not used by the reader; every population in the file is loaded
     */
    public void ReadFile(String filename, String ethnicity){
        BufferedReader br = null;
        try{
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
            int linenum = 0;
            String strLine;
            //Read File Line By Line
            while ((strLine = br.readLine()) != null) {
                linenum++;
                String [] s = strLine.split("\\t");
                if (linenum == 1){
                    //Determine number of populations, create a hash table for each population
                    AlleleFrequencies = new Hashtable[s.length-1];
                    Populations = new String[s.length-1];
                    for (int i = 1; i < s.length; i++){
                        Populations[i-1]=s[i];
                        AlleleFrequencies[i-1] = new Hashtable();
                    }
                }else{
                    // A row wider than the header used to overflow AlleleFrequencies and abort the
                    // whole load; only the columns that have a population are read now.
                    int columns = s.length;
                    if (columns > Populations.length + 1){
                        System.err.println("FrequencyFileReader: ignoring extra columns on line " + linenum + " of " + filename);
                        columns = Populations.length + 1;
                    }
                    //assign allele frequencies for each population
                    for (int i = 1; i < columns; i++){
                        double frequency = Double.parseDouble(s[i]);
                        if (frequency > 0.0001){
                            CommonAlleles.put(s[0], s[0]);
                        }
                        AlleleFrequencies[i-1].put(s[0],s[i]);
                        Object best = MaxFrequencies.get(s[0]);
                        if (best == null || Double.parseDouble(best.toString()) < frequency){
                            MaxFrequencies.put(s[0], s[i]);
                        }
                    }
                }
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("Exception in FrequencyFileReader (" + e.getMessage() + ").");
        }finally{
            // Close on every path so a failed read does not leak the file handle.
            if (br != null){
                try{
                    br.close();
                }catch (Exception ignored){
                    // nothing useful can be done if closing fails
                }
            }
        }
    }
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,737 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods [-filter $ID.filter -minAllowe\
|
||||||
|
dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
|
||||||
|
* @author shermanjia
|
||||||
|
*/
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
|
public class HLACallerWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
@Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
|
||||||
|
public String baseLikelihoodsFile = "";
|
||||||
|
|
||||||
|
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
|
||||||
|
public boolean DEBUG = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||||
|
public String filterFile = "";
|
||||||
|
|
||||||
|
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||||
|
public String ethnicity = "CaucasiansUSA";
|
||||||
|
|
||||||
|
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
|
||||||
|
public String debugAlleles = "";
|
||||||
|
|
||||||
|
@Argument(fullName = "phaseInterval", shortName = "phaseInterval", doc = "Use only these intervals in phase calculation", required = false)
|
||||||
|
public String phaseIntervalFile = "";
|
||||||
|
|
||||||
|
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||||
|
public boolean ONLYFREQUENT = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||||
|
public int MINALLOWEDMISMATCHES = 7;
|
||||||
|
|
||||||
|
GATKArgumentCollection args = this.getToolkit().getArguments();
|
||||||
|
|
||||||
|
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
|
||||||
|
String PolymorphicSitesfile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
|
||||||
|
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
|
||||||
|
|
||||||
|
// Initializing variables
|
||||||
|
|
||||||
|
HLAFileReader HLADictionaryReader = new HLAFileReader();
|
||||||
|
boolean HLAdataLoaded = false;
|
||||||
|
String[] HLAnames, HLAreads, Populations;
|
||||||
|
ArrayList<String> ReadsToDiscard;
|
||||||
|
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
|
||||||
|
|
||||||
|
|
||||||
|
int[][] numObservations, totalObservations, intervals;
|
||||||
|
int[] SNPnumInRead, SNPposInRead, positions;
|
||||||
|
CigarParser cigarparser = new CigarParser();
|
||||||
|
Hashtable MaxLikelihoods = new Hashtable();
|
||||||
|
Hashtable MaxFrequencies, CommonAlleles, AlleleCount, LocusCount;
|
||||||
|
Hashtable[] AlleleFrequencies;
|
||||||
|
int numIntervals;
|
||||||
|
double[][] baseLikelihoods;
|
||||||
|
|
||||||
|
ArrayList AllelesToSearch = new ArrayList<String>();
|
||||||
|
|
||||||
|
// setting error rates for phasing algorithm (1% expected error rate for any genotype)
|
||||||
|
|
||||||
|
double P_err = 0.01;
|
||||||
|
double P_correct = 1 - P_err;
|
||||||
|
double L_err = Math.log10(P_err);
|
||||||
|
double L_correct = Math.log10(P_correct);
|
||||||
|
|
||||||
|
public Integer reduceInit() {
|
||||||
|
|
||||||
|
if (!HLAdataLoaded){
|
||||||
|
HLAdataLoaded = true;
|
||||||
|
|
||||||
|
//Load HLA dictionary
|
||||||
|
|
||||||
|
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||||
|
HLAreads = HLADictionaryReader.GetSequences();
|
||||||
|
HLAnames = HLADictionaryReader.GetNames();
|
||||||
|
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||||
|
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||||
|
|
||||||
|
//Load pre-processing file for misaligned reads and list of alleles to search
|
||||||
|
|
||||||
|
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||||
|
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
||||||
|
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||||
|
AllelesToSearch = similarityReader.GetAllelesToSearch();
|
||||||
|
AlleleCount = similarityReader.GetAlleleCount();
|
||||||
|
LocusCount = similarityReader.GetLocusCount();
|
||||||
|
for (int i = 0; i < AllelesToSearch.size(); i++){
|
||||||
|
out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
//Load genotypes and find polymorphic sites (sites that differ from reference)
|
||||||
|
|
||||||
|
BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
|
||||||
|
baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
|
||||||
|
baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods();
|
||||||
|
positions = baseLikelihoodsReader.GetPositions();
|
||||||
|
PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
|
||||||
|
out.printf("INFO\t%s polymorphic sites found\n",PolymorphicSites.length);
|
||||||
|
|
||||||
|
int l = PolymorphicSites.length;
|
||||||
|
SNPnumInRead = new int[l];
|
||||||
|
SNPposInRead = new int[l];
|
||||||
|
numObservations = new int[l*5][l*5];
|
||||||
|
totalObservations = new int[l][l];
|
||||||
|
|
||||||
|
//Load allele frequencies for different populations
|
||||||
|
|
||||||
|
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||||
|
HLAfreqReader.ReadFile(AlleleFrequencyFile,ethnicity);
|
||||||
|
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||||
|
MaxFrequencies = HLAfreqReader.GetMaxFrequencies();
|
||||||
|
CommonAlleles = HLAfreqReader.GetCommonAlleles();
|
||||||
|
Populations = HLAfreqReader.GetPopulations();
|
||||||
|
|
||||||
|
//Load genomic intervals for bam file
|
||||||
|
|
||||||
|
if (!phaseIntervalFile.equals("")){
|
||||||
|
TextFileReader fileReader = new TextFileReader();
|
||||||
|
fileReader.ReadFile(phaseIntervalFile);
|
||||||
|
String[] lines = fileReader.GetLines();
|
||||||
|
intervals = new int[lines.length][2];
|
||||||
|
for (int i = 0; i < lines.length; i++) {
|
||||||
|
String[] s = lines[i].split(":");
|
||||||
|
String[] intervalPieces = s[1].split("-");
|
||||||
|
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
|
||||||
|
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
|
||||||
|
}
|
||||||
|
numIntervals = intervals.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
if (!ReadsToDiscard.contains(read.getReadName())){
|
||||||
|
UpdateCorrelation(read);
|
||||||
|
}else{
|
||||||
|
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return value + sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(Integer numreads) {
|
||||||
|
String name1, name2, d4_name1, d4_name2, d2_name1, d2_name2;
|
||||||
|
Double frq1 = 0.0, frq2 = 0.0, log1 = 0.0, log2 = 0.0,alleleLikelihood= 0.0, phaseLikelihood=0.0, minfrq = 0.0, likelihood = 0.0;
|
||||||
|
int numCombinations = 0;
|
||||||
|
|
||||||
|
//For debugging specific alleles
|
||||||
|
if (!debugAlleles.equals("")){
|
||||||
|
String s[] = debugAlleles.split(",");
|
||||||
|
int index1 = HLADictionaryReader.GetIndex(s[0]);
|
||||||
|
int index2 = HLADictionaryReader.GetIndex(s[1]);
|
||||||
|
out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
|
||||||
|
if (index1 > -1 && index2 > -1){
|
||||||
|
alleleLikelihood = CalculateAlleleLikelihood(index1,index2,HLAreads,true);
|
||||||
|
phaseLikelihood = CalculatePhaseLikelihood(index1,index2,true,false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ONLYFREQUENT){
|
||||||
|
minfrq = 0.0001;
|
||||||
|
}
|
||||||
|
|
||||||
|
double max;
|
||||||
|
ArrayList Output = new ArrayList<String>();
|
||||||
|
ArrayList Likelihoods = new ArrayList<Double>();
|
||||||
|
Hashtable TotalProb = new Hashtable();
|
||||||
|
//Search pairs of alleles that satisfy initial search criteria
|
||||||
|
|
||||||
|
// Allele 1
|
||||||
|
|
||||||
|
for (int i = 0; i < HLAnames.length; i++){
|
||||||
|
name1 = HLAnames[i].substring(4);
|
||||||
|
String [] n1 = name1.split("\\*");
|
||||||
|
d4_name1 = n1[0] + "*" + n1[1].substring(0, 4);
|
||||||
|
d2_name1 = n1[0] + "*" + n1[1].substring(0, 2);
|
||||||
|
if (AllelesToSearch.contains(d4_name1)){// || CommonAlleles.contains(d4_name1)
|
||||||
|
|
||||||
|
if (MaxFrequencies.containsKey(d4_name1)){
|
||||||
|
frq1 = Double.parseDouble(MaxFrequencies.get(d4_name1).toString());
|
||||||
|
}else{
|
||||||
|
if (n1[1].length() > 4){if (n1[1].substring(4, 5).equals("N")){frq1 = .00000005;}else{frq1 = .000001;}}else{frq1 = .000001;}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allele 2
|
||||||
|
|
||||||
|
for (int j = i; j < HLAnames.length; j++){
|
||||||
|
name2 = HLAnames[j].substring(4);
|
||||||
|
String [] n2 = name2.split("\\*");
|
||||||
|
d4_name2 = n2[0] + "*" + n2[1].substring(0, 4);
|
||||||
|
d2_name2 = n2[0] + "*" + n2[1].substring(0, 2);
|
||||||
|
if (n1[0].equals(n2[0]) && (AllelesToSearch.contains(d4_name2))){// || CommonAlleles.contains(d4_name2)
|
||||||
|
if (MaxFrequencies.containsKey(d4_name2)){
|
||||||
|
frq2 = Double.parseDouble(MaxFrequencies.get(d4_name2).toString());
|
||||||
|
}else{
|
||||||
|
if (n2[1].length() > 4){if (n2[1].substring(4, 5).equals("N")){frq2 = .00000005;}else{frq2 = .000001;}}else{frq2 = .000001;}
|
||||||
|
}
|
||||||
|
//Calculate allele and phase likelihoods for each allele pair
|
||||||
|
alleleLikelihood = CalculateAlleleLikelihood(i,j,HLAreads,false);
|
||||||
|
numCombinations++;
|
||||||
|
|
||||||
|
//If there is data at the allele pair, continue with other calculations
|
||||||
|
|
||||||
|
if (alleleLikelihood < 0){
|
||||||
|
phaseLikelihood = CalculatePhaseLikelihood(i,j,false,false);
|
||||||
|
log1=Math.log10(frq1);
|
||||||
|
log2=Math.log10(frq2);
|
||||||
|
|
||||||
|
//sum likelihoods
|
||||||
|
|
||||||
|
likelihood = alleleLikelihood+phaseLikelihood+log1+log2;
|
||||||
|
if (!MaxLikelihoods.containsKey(n1[0])){MaxLikelihoods.put(n1[0], likelihood);}
|
||||||
|
|
||||||
|
if (likelihood > (Double) MaxLikelihoods.get(n1[0])) {
|
||||||
|
MaxLikelihoods.put(n1[0], likelihood);
|
||||||
|
}
|
||||||
|
Likelihoods.add(likelihood);
|
||||||
|
String data = String.format("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f",n1[0],name1,name2,alleleLikelihood,phaseLikelihood,log1,log2,likelihood);
|
||||||
|
Output.add(data);
|
||||||
|
out.printf("INFO\t%s\n",data);
|
||||||
|
if (DEBUG){
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//Print output
|
||||||
|
out.printf("Locus\tA1\tA2\tGeno\tPhase\tFrq1\tFrq2\tL\tProb\tReads1\tReads2\tLocus\tEXP");
|
||||||
|
for (int i = 0; i < Populations.length; i++){
|
||||||
|
out.printf("\t%s",Populations[i]);
|
||||||
|
}
|
||||||
|
out.printf("\n");
|
||||||
|
|
||||||
|
//Calculate probabilities for each locus
|
||||||
|
Double probSum = 0.0, prob = 0.0, f1 = 0.0, f2 = 0.0, aLikelihood4 = 0.0, pLikelihood4 = 0.0;
|
||||||
|
Integer count = 0;
|
||||||
|
Hashtable HLA4DigitProbs = new Hashtable();
|
||||||
|
Hashtable HLA4DigitLs = new Hashtable();
|
||||||
|
Hashtable HLA4DigitCount = new Hashtable();
|
||||||
|
Hashtable HLA4DigitF1 = new Hashtable();
|
||||||
|
Hashtable HLA4DigitF2 = new Hashtable();
|
||||||
|
Hashtable HLA4DigitA = new Hashtable();
|
||||||
|
Hashtable HLA4DigitP = new Hashtable();
|
||||||
|
|
||||||
|
String key;
|
||||||
|
Enumeration keys = LocusCount.keys();
|
||||||
|
while (keys.hasMoreElements()){
|
||||||
|
String locus = keys.nextElement().toString();
|
||||||
|
|
||||||
|
probSum = 0.0;
|
||||||
|
ArrayList localOutput = new ArrayList<String>();
|
||||||
|
ArrayList localLikelihoods = new ArrayList<Double>();
|
||||||
|
|
||||||
|
//Sum probabilities for each locus
|
||||||
|
|
||||||
|
for (int j = 0; j < Output.size(); j++){
|
||||||
|
String data = Output.get(j).toString();
|
||||||
|
String [] d = data.split("\\t");
|
||||||
|
if (d[0].equals(locus)){
|
||||||
|
localOutput.add(data);
|
||||||
|
likelihood = (Double)Likelihoods.get(j)-(Double)MaxLikelihoods.get(locus);
|
||||||
|
localLikelihoods.add(likelihood);
|
||||||
|
probSum = probSum + Math.pow(10, likelihood);
|
||||||
|
//out.printf("INFO\t%s\t%s\t%.2f\t%.2f\t%.2f\t%s\n",locus,data,likelihood,(Double)MaxLikelihoods.get(locus),(Double)Likelihoods.get(j),probSum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//aggregate statistics for 4-digit types
|
||||||
|
|
||||||
|
String A1 = "", A2 = "", a1 = "", a2 = "";
|
||||||
|
String [] s, s1, s2;
|
||||||
|
Double prob4digit = 0.0;
|
||||||
|
int n = 0;
|
||||||
|
|
||||||
|
for (int j = 0; j < localOutput.size(); j++){
|
||||||
|
String data = localOutput.get(j).toString();
|
||||||
|
prob = Math.pow(10, (Double)localLikelihoods.get(j))/probSum;
|
||||||
|
|
||||||
|
if (prob > 0.005){
|
||||||
|
s = data.split("\\t");
|
||||||
|
s1 = s[1].split("\\*");
|
||||||
|
s2 = s[2].split("\\*");
|
||||||
|
a1 = s1[0] + "*" + s1[1].substring(0,4);
|
||||||
|
a2 = s2[0] + "*" + s2[1].substring(0,4);
|
||||||
|
key = a1 + "," + a2;
|
||||||
|
aLikelihood4 = Double.valueOf(s[3]);
|
||||||
|
pLikelihood4 = Double.valueOf(s[4]);
|
||||||
|
likelihood = aLikelihood4 + pLikelihood4 + f1 + f2;
|
||||||
|
f1 = Double.valueOf(s[5]);
|
||||||
|
f2 = Double.valueOf(s[6]);
|
||||||
|
if (!HLA4DigitProbs.containsKey(key)){
|
||||||
|
HLA4DigitProbs.put(key, prob);
|
||||||
|
HLA4DigitLs.put(key, likelihood);
|
||||||
|
HLA4DigitCount.put(key, 1);
|
||||||
|
HLA4DigitF1.put(key,f1);
|
||||||
|
HLA4DigitF2.put(key,f2);
|
||||||
|
HLA4DigitA.put(key,aLikelihood4);
|
||||||
|
HLA4DigitP.put(key,pLikelihood4);
|
||||||
|
}else{
|
||||||
|
prob = prob + Double.valueOf(HLA4DigitProbs.get(key).toString());
|
||||||
|
HLA4DigitProbs.put(key, prob);
|
||||||
|
likelihood = likelihood + Double.valueOf(HLA4DigitLs.get(key).toString());
|
||||||
|
HLA4DigitLs.put(key, likelihood);
|
||||||
|
n = Integer.valueOf(HLA4DigitCount.get(key).toString()) + 1;
|
||||||
|
HLA4DigitCount.put(key, n);
|
||||||
|
aLikelihood4 = aLikelihood4 + Double.valueOf(HLA4DigitA.get(key).toString());
|
||||||
|
HLA4DigitA.put(key, aLikelihood4);
|
||||||
|
pLikelihood4 = pLikelihood4 + Double.valueOf(HLA4DigitP.get(key).toString());
|
||||||
|
HLA4DigitP.put(key, pLikelihood4);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Print results
|
||||||
|
Enumeration P = HLA4DigitProbs.keys();
|
||||||
|
String K = ""; String [] s, s1, s2;
|
||||||
|
double count1, count2, locusCount, accountedFor;
|
||||||
|
|
||||||
|
// Sort hashtable.
|
||||||
|
Vector v = new Vector(HLA4DigitProbs.keySet());
|
||||||
|
Collections.sort(v);
|
||||||
|
|
||||||
|
// Display (sorted) hashtable.
|
||||||
|
for (Enumeration e = v.elements(); e.hasMoreElements();) {
|
||||||
|
K = (String)e.nextElement();
|
||||||
|
prob = (Double) HLA4DigitProbs.get(K);
|
||||||
|
|
||||||
|
likelihood = (Double) HLA4DigitLs.get(K);
|
||||||
|
count = (Integer) HLA4DigitCount.get(K);
|
||||||
|
s = K.split("\\,");
|
||||||
|
s1 = s[0].split("\\*"); name1 = s1[1];
|
||||||
|
s2 = s[1].split("\\*"); name2 = s2[1];
|
||||||
|
aLikelihood4 = (Double) HLA4DigitA.get(K);
|
||||||
|
pLikelihood4 = (Double) HLA4DigitP.get(K);
|
||||||
|
f1 = (Double) HLA4DigitF1.get(K);
|
||||||
|
f2 = (Double) HLA4DigitF2.get(K);
|
||||||
|
count1 = Double.valueOf(AlleleCount.get(s[0]).toString());
|
||||||
|
count2 = Double.valueOf(AlleleCount.get(s[1]).toString());
|
||||||
|
locusCount = Double.valueOf(LocusCount.get(s1[0]).toString());
|
||||||
|
if (s[0].equals(s[1])){
|
||||||
|
accountedFor = count1 / locusCount;
|
||||||
|
}else{
|
||||||
|
accountedFor = (count1 + count2) / locusCount;
|
||||||
|
}
|
||||||
|
if (prob > 0.1){
|
||||||
|
out.printf("%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
|
||||||
|
for (int i = 0; i < Populations.length; i++){
|
||||||
|
if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
|
||||||
|
if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
|
||||||
|
if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
|
||||||
|
}
|
||||||
|
out.print("\n");
|
||||||
|
}
|
||||||
|
out.printf("INFO\t%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
|
||||||
|
for (int i = 0; i < Populations.length; i++){
|
||||||
|
if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
|
||||||
|
if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
|
||||||
|
if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
|
||||||
|
}
|
||||||
|
out.print("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Comparator<Double> valueComparator = new Comparator<Double>() {
|
||||||
|
@Override public int compare(Double val1, Double val2) {
|
||||||
|
return val1.compareTo(val2);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
private Integer[] InitializePolymorphicSites(){
|
||||||
|
int HLA_A_start = 30018310, HLA_A_end = 30021211, num_A_positions = HLA_A_end - HLA_A_start + 1;
|
||||||
|
int HLA_B_start = 31430239, HLA_B_end = 31432914, num_B_positions = HLA_B_end - HLA_B_start + 1;
|
||||||
|
int HLA_C_start = 31344925, HLA_C_end = 31347827, num_C_positions = HLA_C_end - HLA_C_start + 1;
|
||||||
|
Integer[] polymorphicSites = new Integer[num_A_positions+num_B_positions+num_C_positions];
|
||||||
|
for (int i = 0; i < num_A_positions; i++){
|
||||||
|
polymorphicSites[i]=HLA_A_start + i;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < num_C_positions; i++){
|
||||||
|
polymorphicSites[i+num_A_positions]=HLA_C_start + i;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < num_B_positions; i++){
|
||||||
|
polymorphicSites[i+num_A_positions+num_C_positions]=HLA_B_start + i;
|
||||||
|
}
|
||||||
|
return polymorphicSites;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int IndexOf(char c){
|
||||||
|
switch(c){
|
||||||
|
case 'A': return 0;
|
||||||
|
case 'C': return 1;
|
||||||
|
case 'G': return 2;
|
||||||
|
case 'T': return 3;
|
||||||
|
//case 'D': return 4;
|
||||||
|
default: return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean IsWithinInterval(int pos){
|
||||||
|
boolean isWithinInterval = false;
|
||||||
|
for (int i = 0; i < numIntervals; i++){
|
||||||
|
if (pos >= intervals[i][0] && pos <= intervals[i][1]){
|
||||||
|
isWithinInterval = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return isWithinInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void UpdateCorrelation(SAMRecord read){
|
||||||
|
//Updates correlation table with SNPs from specific read (for phasing)
|
||||||
|
String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());
|
||||||
|
ArrayList<Integer> SNPsInRead = new ArrayList<Integer>();
|
||||||
|
ArrayList<Integer> readindex = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
int readstart = read.getAlignmentStart();
|
||||||
|
int readend = read.getAlignmentEnd();
|
||||||
|
int numPositions = PolymorphicSites.length;
|
||||||
|
char c1, c2;
|
||||||
|
int a, b, i, j, SNPcount = 0;
|
||||||
|
|
||||||
|
//Find all SNPs in read
|
||||||
|
for (i = 0; i < numPositions; i++){
|
||||||
|
if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
|
||||||
|
SNPnumInRead[i] = SNPcount;
|
||||||
|
SNPposInRead[i] = PolymorphicSites[i]-readstart;
|
||||||
|
SNPcount++;
|
||||||
|
}else{
|
||||||
|
SNPnumInRead[i] = -1;
|
||||||
|
SNPposInRead[i] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Update correlation table; for each combination of SNP positions
|
||||||
|
for (i = 0; i < numPositions; i++){
|
||||||
|
if (SNPnumInRead[i] > -1){
|
||||||
|
c1 = s.charAt(SNPposInRead[i]);
|
||||||
|
if (IndexOf(c1) > -1){
|
||||||
|
for (j = i+1; j < numPositions; j ++){
|
||||||
|
if (SNPnumInRead[j] > -1){
|
||||||
|
c2 = s.charAt(SNPposInRead[j]);
|
||||||
|
if (IndexOf(c2) > -1){
|
||||||
|
a = i*5 + IndexOf(c1);
|
||||||
|
b = j*5 + IndexOf(c2);
|
||||||
|
|
||||||
|
numObservations[a][b]++;
|
||||||
|
totalObservations[i][j]++;
|
||||||
|
if (DEBUG){
|
||||||
|
//out.printf("INFO %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int GenotypeIndex(char a, char b){
|
||||||
|
switch(a){
|
||||||
|
case 'A':
|
||||||
|
switch(b){
|
||||||
|
case 'A': return 0;
|
||||||
|
case 'C': return 1;
|
||||||
|
case 'G': return 2;
|
||||||
|
case 'T': return 3;
|
||||||
|
};
|
||||||
|
case 'C':
|
||||||
|
switch(b){
|
||||||
|
case 'A': return 1;
|
||||||
|
case 'C': return 4;
|
||||||
|
case 'G': return 5;
|
||||||
|
case 'T': return 6;
|
||||||
|
};
|
||||||
|
case 'G':
|
||||||
|
switch(b){
|
||||||
|
case 'A': return 2;
|
||||||
|
case 'C': return 5;
|
||||||
|
case 'G': return 7;
|
||||||
|
case 'T': return 8;
|
||||||
|
};
|
||||||
|
case 'T':
|
||||||
|
switch(b){
|
||||||
|
case 'A': return 3;
|
||||||
|
case 'C': return 6;
|
||||||
|
case 'G': return 8;
|
||||||
|
case 'T': return 9;
|
||||||
|
};
|
||||||
|
default: return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private double CalculateAlleleLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
|
||||||
|
//Calculates likelihood for specific allele pair
|
||||||
|
String read1 = HLAalleles[a1];
|
||||||
|
String read2 = HLAalleles[a2];
|
||||||
|
int start1 = HLAstartpos[a1];
|
||||||
|
int start2 = HLAstartpos[a2];
|
||||||
|
int stop1 = HLAstoppos[a1];
|
||||||
|
int stop2 = HLAstoppos[a2];
|
||||||
|
double likelihood = 0;
|
||||||
|
int pos, index;
|
||||||
|
char c1, c2;
|
||||||
|
|
||||||
|
|
||||||
|
for (int i = 0; i < positions.length; i++){
|
||||||
|
pos = positions[i];
|
||||||
|
if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){
|
||||||
|
index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2));
|
||||||
|
if (index > -1){
|
||||||
|
likelihood = likelihood + baseLikelihoods[i][index];
|
||||||
|
if (debug){
|
||||||
|
c1 = read1.charAt(pos-start1);
|
||||||
|
c2 = read2.charAt(pos-start2);
|
||||||
|
out.printf("INFO: DEBUG %s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return likelihood;
|
||||||
|
}
|
||||||
|
|
||||||
|
private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG, boolean SINGLEALLELE){
|
||||||
|
//calculate the likelihood that the particular combination of alleles satisfies the phase count data
|
||||||
|
double likelihood = 0, prob = 0;
|
||||||
|
int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
|
||||||
|
int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
|
||||||
|
int combinedstart = Math.max(readstart1,readstart2);
|
||||||
|
int combinedstop = Math.min(readend1,readend2);
|
||||||
|
|
||||||
|
int numPositions = PolymorphicSites.length, SNPcount = 0;
|
||||||
|
int i, j, a1, a2, b1, b2;
|
||||||
|
char c11, c12, c21, c22;
|
||||||
|
int numInPhase = 0, numOutOfPhase = 0;
|
||||||
|
double sumInPhase = 0.0, sumObservations = 0.0;
|
||||||
|
|
||||||
|
|
||||||
|
//Find all SNPs in read
|
||||||
|
for (i = 0; i < numPositions; i++){
|
||||||
|
|
||||||
|
if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop ){ // && IsWithinInterval(PolymorphicSites[i])
|
||||||
|
if (PRINTDEBUG){
|
||||||
|
out.printf("DEBUG\t%s\t%s\n",PolymorphicSites[i],SNPcount);
|
||||||
|
}
|
||||||
|
SNPnumInRead[i] = SNPcount;
|
||||||
|
SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
|
||||||
|
SNPcount++;
|
||||||
|
}else{
|
||||||
|
SNPnumInRead[i] = -1;
|
||||||
|
SNPposInRead[i] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
String s1 = HLAreads[alleleIndex1];
|
||||||
|
String s2 = HLAreads[alleleIndex2];
|
||||||
|
if (PRINTDEBUG){
|
||||||
|
out.printf("DEBUG %s SNPs found in %s and %s between %s and %s\n",SNPcount,HLAnames[alleleIndex1], HLAnames[alleleIndex2],combinedstart,combinedstop);
|
||||||
|
}
|
||||||
|
//Iterate through every pairwise combination of SNPs, and update likelihood for the allele combination
|
||||||
|
for (i = 0; i < numPositions; i++){
|
||||||
|
if (SNPnumInRead[i] > -1){
|
||||||
|
c11 = s1.charAt(SNPposInRead[i]);
|
||||||
|
c21 = s2.charAt(SNPposInRead[i]);
|
||||||
|
if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
|
||||||
|
for (j = i+1; j < numPositions; j ++){
|
||||||
|
if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
|
||||||
|
c12 = s1.charAt(SNPposInRead[j]);
|
||||||
|
c22 = s2.charAt(SNPposInRead[j]);
|
||||||
|
if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
|
||||||
|
a1 = i*5 + IndexOf(c11);
|
||||||
|
b1 = j*5 + IndexOf(c12);
|
||||||
|
a2 = i*5 + IndexOf(c21);
|
||||||
|
b2 = j*5 + IndexOf(c22);
|
||||||
|
//check if the two alleles are identical at the chosen 2 locations
|
||||||
|
if ((c11 == c21) && (c12 == c22)){
|
||||||
|
numInPhase = numObservations[a1][b1];
|
||||||
|
}else{
|
||||||
|
numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
|
||||||
|
}
|
||||||
|
numOutOfPhase = totalObservations[i][j] - numInPhase;
|
||||||
|
sumInPhase += (double) numInPhase;
|
||||||
|
sumObservations += (double) totalObservations[i][j];
|
||||||
|
if (SINGLEALLELE){
|
||||||
|
likelihood = sumInPhase / sumObservations;
|
||||||
|
}else{
|
||||||
|
likelihood += numInPhase * L_correct + numOutOfPhase * L_err;
|
||||||
|
}
|
||||||
|
//prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001);
|
||||||
|
//likelihood += Math.log10(prob);
|
||||||
|
//likelihood = Math.max(Math.log10(sumInPhase / sumObservations),-10);
|
||||||
|
|
||||||
|
if (PRINTDEBUG){
|
||||||
|
out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%s / %s\t %.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],sumInPhase,sumObservations,likelihood);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return likelihood;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void ExtraCode(){
|
||||||
|
String name1, name2;
|
||||||
|
//Pre-process homozygous combinations to determine top possible alleles (for efficiency)
|
||||||
|
Hashtable Alleles2Digit = new Hashtable();
|
||||||
|
Hashtable Phase2Digit = new Hashtable();
|
||||||
|
Hashtable Count2Digit = new Hashtable();
|
||||||
|
|
||||||
|
Hashtable AllelesAtLocus = new Hashtable();
|
||||||
|
ArrayList Loci = new ArrayList<String>();
|
||||||
|
double[] AlleleLikelihoods2 = new double[HLAnames.length];
|
||||||
|
double[] PhaseLikelihoods2 = new double[HLAnames.length];
|
||||||
|
for (int i = 0; i < HLAnames.length; i++){
|
||||||
|
name1 = HLAnames[i].substring(4);
|
||||||
|
String [] n1 = name1.split("\\*");
|
||||||
|
AlleleLikelihoods2[i] = CalculateAlleleLikelihood(i,i,HLAreads,false);
|
||||||
|
PhaseLikelihoods2[i] = CalculatePhaseLikelihood(i,i,false,true);
|
||||||
|
if (AlleleLikelihoods2[i] < 0){
|
||||||
|
name2 = n1[0] + "*" + n1[1].substring(0, 4);
|
||||||
|
if (!Loci.contains(n1[0])){
|
||||||
|
Loci.add(n1[0]);
|
||||||
|
MaxLikelihoods.put(n1[0], 0.0);
|
||||||
|
AllelesAtLocus.put(n1[0], 1);
|
||||||
|
}else{
|
||||||
|
AllelesAtLocus.put(n1[0], 1+(Integer)AllelesAtLocus.get(n1[0]));
|
||||||
|
}
|
||||||
|
if (!Alleles2Digit.containsKey(name2)){
|
||||||
|
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
|
||||||
|
Phase2Digit.put(name2, PhaseLikelihoods2[i]);
|
||||||
|
Count2Digit.put(name2, 1.0);
|
||||||
|
}else {
|
||||||
|
if (AlleleLikelihoods2[i] > (Double) Alleles2Digit.get(name2)){
|
||||||
|
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
|
||||||
|
}
|
||||||
|
if (PhaseLikelihoods2[i] > (Double) Phase2Digit.get(name2)){
|
||||||
|
Phase2Digit.put(name2, PhaseLikelihoods2[i]);
|
||||||
|
}
|
||||||
|
Count2Digit.put(name2,1.0+(Double)Count2Digit.get(name2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Sort alleles at 2 digit resolution for each locus
|
||||||
|
|
||||||
|
for (int i = 0; i < Loci.size(); i++){
|
||||||
|
Enumeration k = Alleles2Digit.keys();
|
||||||
|
Hashtable AllelesAtLoci = new Hashtable();
|
||||||
|
HashMap map = new HashMap();
|
||||||
|
int numalleles = 0;
|
||||||
|
//find alleles at the locus
|
||||||
|
while( k.hasMoreElements() ){
|
||||||
|
name1 = k.nextElement().toString();
|
||||||
|
String [] n1 = name1.split("\\*");
|
||||||
|
if (Loci.get(i).equals(n1[0])){
|
||||||
|
numalleles++;
|
||||||
|
map.put(name1,-1 * (Double) Alleles2Digit.get(name1));
|
||||||
|
AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
|
||||||
|
//out.printf("%s\t%.2f\n",name1,-1 * (Double) Alleles2Digit.get(name1));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//Sort alleles at locus, mark top six 2-digit classes for deep search
|
||||||
|
List<Map.Entry<String, Double>> entries = new ArrayList<Entry<String, Double>>(map.entrySet());
|
||||||
|
Collections.sort(entries, new Comparator<Entry<String, Double>>() {
|
||||||
|
public int compare(Entry<String, Double> e1, Entry<String, Double> e2) {
|
||||||
|
return e1.getValue().compareTo(e2.getValue());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
int num = 1;
|
||||||
|
for (Map.Entry<String, Double> entry : entries) {
|
||||||
|
if (num <= Math.max(5,entries.size()/8)){
|
||||||
|
AllelesToSearch.add(entry.getKey());
|
||||||
|
out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
|
||||||
|
num++;
|
||||||
|
}else{
|
||||||
|
if (!AllelesToSearch.contains(entry.getKey())){
|
||||||
|
out.printf("INFO\t%s\t%.2f\t%.2f\tNotSearched\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
|
||||||
|
}else{
|
||||||
|
out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out.printf("INFO\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,79 @@
|
||||||
|
/*
|
||||||
|
* To change this template, choose Tools | Templates
|
||||||
|
* and open the template in the editor.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author shermanjia
|
||||||
|
*/
|
||||||
|
public class HLAFileReader {
|
||||||
|
ArrayList<String> Sequences = new ArrayList<String>();
|
||||||
|
ArrayList<String> Names = new ArrayList<String>();
|
||||||
|
ArrayList<Integer> StartPositions = new ArrayList<Integer>();
|
||||||
|
ArrayList<Integer> StopPositions = new ArrayList<Integer>();
|
||||||
|
int minstartpos;
|
||||||
|
int maxstoppos;
|
||||||
|
|
||||||
|
CigarParser formatter = new CigarParser();
|
||||||
|
|
||||||
|
public String[] GetNames(){
|
||||||
|
return Names.toArray(new String[Names.size()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String[] GetSequences(){
|
||||||
|
return Sequences.toArray(new String[Sequences.size()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer[] GetStartPositions(){
|
||||||
|
return StartPositions.toArray(new Integer[StartPositions.size()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer[] GetStopPositions(){
|
||||||
|
return StopPositions.toArray(new Integer[StopPositions.size()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Integer GetMinStartPos(){
|
||||||
|
return minstartpos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer GetMaxStopPos(){
|
||||||
|
return maxstoppos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int GetIndex(String readname){
|
||||||
|
if (Names.contains(readname)){
|
||||||
|
return Names.indexOf(readname);
|
||||||
|
}else{
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void ReadFile(String filename){
|
||||||
|
try{
|
||||||
|
FileInputStream fstream = new FileInputStream(filename);
|
||||||
|
DataInputStream in = new DataInputStream(fstream);
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||||
|
String strLine; String [] s = null;
|
||||||
|
//Read File Line By Line
|
||||||
|
while ((strLine = br.readLine()) != null) {
|
||||||
|
s = strLine.split("\\t");
|
||||||
|
Sequences.add(s[3]);
|
||||||
|
Names.add(s[0]);
|
||||||
|
StartPositions.add(Integer.valueOf(s[1]));
|
||||||
|
StopPositions.add(Integer.valueOf(s[2]));
|
||||||
|
minstartpos = Math.min(minstartpos, Integer.valueOf(s[1]));
|
||||||
|
maxstoppos = Math.max(maxstoppos, Integer.valueOf(s[2]));
|
||||||
|
}
|
||||||
|
in.close();
|
||||||
|
}catch (Exception e){//Catch exception if any
|
||||||
|
System.err.println("HLAFileReader Error: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -24,6 +24,14 @@ public class PolymorphicSitesFileReader {
|
||||||
return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]);
|
return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void AddSites(Integer [] sites){
|
||||||
|
for (int i = 0; i < sites.length; i++){
|
||||||
|
if (!PolymorphicSites.contains(sites[i])){
|
||||||
|
PolymorphicSites.add(sites[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void ReadFile(String filename){
|
public void ReadFile(String filename){
|
||||||
try{
|
try{
|
||||||
FileInputStream fstream = new FileInputStream(filename);
|
FileInputStream fstream = new FileInputStream(filename);
|
||||||
|
|
@ -34,10 +42,10 @@ public class PolymorphicSitesFileReader {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while ((strLine = br.readLine()) != null) {
|
while ((strLine = br.readLine()) != null) {
|
||||||
s = strLine.split("\\t");
|
s = strLine.split("\\t");
|
||||||
if (s[0].equals("POLYMORPHIC")){
|
if (Double.valueOf(s[8]) > 0.1){
|
||||||
PolymorphicSites.add(Integer.valueOf(s[2]));
|
PolymorphicSites.add(Integer.valueOf(s[0]));
|
||||||
}else{
|
}else{
|
||||||
NonPolymorphicSites.add(Integer.valueOf(s[2]));
|
NonPolymorphicSites.add(Integer.valueOf(s[0]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
in.close();
|
in.close();
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,9 @@ import java.util.Hashtable;
|
||||||
*/
|
*/
|
||||||
public class SimilarityFileReader {
|
public class SimilarityFileReader {
|
||||||
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
||||||
|
ArrayList<String> AllelesToSearch = new ArrayList<String>();
|
||||||
|
Hashtable AlleleCount = new Hashtable();
|
||||||
|
Hashtable LocusCount = new Hashtable();
|
||||||
Hashtable Concordance = new Hashtable();
|
Hashtable Concordance = new Hashtable();
|
||||||
Hashtable NumMatches = new Hashtable();
|
Hashtable NumMatches = new Hashtable();
|
||||||
Hashtable NumMismatches = new Hashtable();
|
Hashtable NumMismatches = new Hashtable();
|
||||||
|
|
@ -22,10 +25,22 @@ public class SimilarityFileReader {
|
||||||
return ReadsToDiscard;
|
return ReadsToDiscard;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the alleles that ReadFile selected for searching: an allele is added once its
 * AlleleCount exceeds 1.
 *
 * @return the internal AllelesToSearch list itself (not a copy)
 */
public ArrayList<String> GetAllelesToSearch(){
|
||||||
|
return AllelesToSearch;
|
||||||
|
}
|
||||||
|
|
||||||
public String[] GetReadsToDiscardArray(){
|
public String[] GetReadsToDiscardArray(){
|
||||||
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
|
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the per-allele tally built by ReadFile. Keys are allele names of the form
 * locus + "*" + four characters; values are Integer counts, incremented at most once per
 * retained input line.
 *
 * @return the internal AlleleCount table itself (not a copy)
 */
public Hashtable GetAlleleCount(){
|
||||||
|
return AlleleCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the per-locus tally built by ReadFile. Keys are locus names taken from the first
 * allele listed on a retained input line; values are Integer counts of such lines.
 *
 * @return the internal LocusCount table itself (not a copy)
 */
public Hashtable GetLocusCount(){
|
||||||
|
return LocusCount;
|
||||||
|
}
|
||||||
|
|
||||||
public Hashtable GetConcordance(){
|
public Hashtable GetConcordance(){
|
||||||
return Concordance;
|
return Concordance;
|
||||||
}
|
}
|
||||||
|
|
@ -43,26 +58,56 @@ public class SimilarityFileReader {
|
||||||
FileInputStream fstream = new FileInputStream(filename);
|
FileInputStream fstream = new FileInputStream(filename);
|
||||||
DataInputStream in = new DataInputStream(fstream);
|
DataInputStream in = new DataInputStream(fstream);
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||||
String strLine; String [] s = null;
|
String strLine; String [] s = null, alleles = null, a = null; String allele;
|
||||||
//Read File Line By Line
|
//Read File Line By Line
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while ((strLine = br.readLine()) != null) {
|
while ((strLine = br.readLine()) != null) {
|
||||||
s = strLine.split("\\t");
|
s = strLine.split("\\t");
|
||||||
if (s.length >= 6){
|
if (s.length >= 6){
|
||||||
Double matchFraction = Double.valueOf(s[4]);
|
Double matchFraction = Double.valueOf(s[3]);
|
||||||
int numMismatches = Integer.valueOf(s[6]);
|
int numMismatches = Integer.valueOf(s[5]);
|
||||||
|
int numMatches = Integer.valueOf(s[4]);
|
||||||
Concordance.put(s[0],matchFraction);
|
Concordance.put(s[0],matchFraction);
|
||||||
NumMatches.put(s[0], s[5]);
|
NumMatches.put(s[0], s[4]);
|
||||||
NumMismatches.put(s[0], numMismatches);
|
NumMismatches.put(s[0], numMismatches);
|
||||||
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
|
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){
|
||||||
ReadsToDiscard.add(s[0]);
|
ReadsToDiscard.add(s[0]);
|
||||||
|
}else{
|
||||||
|
Hashtable fourDigitAlleles = new Hashtable();
|
||||||
|
alleles = s[6].split("\\,");
|
||||||
|
if (alleles.length > 0){
|
||||||
|
a = alleles[0].split("\\_");
|
||||||
|
s = a[1].split("\\*");
|
||||||
|
if (!LocusCount.containsKey(s[0])){
|
||||||
|
LocusCount.put(s[0], 1);
|
||||||
|
}else{
|
||||||
|
LocusCount.put(s[0], (Integer) LocusCount.get(s[0]) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int j = 0; j < alleles.length; j++){
|
||||||
|
a = alleles[j].split("\\_");
|
||||||
|
s = a[1].split("\\*");
|
||||||
|
allele = s[0] + "*" + s[1].substring(0,4);
|
||||||
|
|
||||||
|
if (!fourDigitAlleles.containsKey(allele)){
|
||||||
|
fourDigitAlleles.put(allele, allele);
|
||||||
|
if (!AlleleCount.containsKey(allele)){
|
||||||
|
AlleleCount.put(allele, 1);
|
||||||
|
}else{
|
||||||
|
AlleleCount.put(allele, (Integer) AlleleCount.get(allele) + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((Integer) AlleleCount.get(allele) > 1 && !AllelesToSearch.contains(allele)){
|
||||||
|
AllelesToSearch.add(allele);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
in.close();
|
in.close();
|
||||||
}catch (Exception e){//Catch exception if any
|
}catch (Exception e){//Catch exception if any
|
||||||
System.err.println("SimilarityFile Error: " + e.getMessage());
|
//System.err.println("SimilarityFile Error: " + e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue