HLA caller updated to examine class II loci, updated pointers to dictionary, allele frequencies.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3290 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
97fdd92e7b
commit
94b51de401
|
|
@ -56,10 +56,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "CaucasianUSA";
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
|
||||
Hashtable AlleleFrequencies,UniqueAlleles;
|
||||
|
||||
CigarParser formatter = new CigarParser();
|
||||
|
|
@ -95,6 +95,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
//out.printf("INFO Common alleles:\n");
|
||||
for (int i = 1; i < UniqueAlleles.size(); i++){
|
||||
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
|
||||
}
|
||||
out.printf("INFO Reading HLA dictionary ...");
|
||||
|
||||
|
||||
|
|
@ -188,12 +192,18 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
|
||||
for (int i = 0; i < numreads; i++){
|
||||
name1 = HLAnames[i].substring(4);
|
||||
if (UniqueAlleles.containsKey(name1)){
|
||||
String [] n1 = name1.split("\\*");
|
||||
// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3));
|
||||
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
|
||||
//out.printf("1: %s\n",name1);
|
||||
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||
//if (frq1 > minfrq){
|
||||
for (int j = i; j < numreads; j++){
|
||||
name2 = HLAnames[j].substring(4);
|
||||
if (UniqueAlleles.containsKey(name2)){
|
||||
String [] n2 = name2.split("\\*");
|
||||
// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3));
|
||||
if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
|
||||
|
||||
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
||||
// if (frq2 > minfrq){
|
||||
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
|
||||
|
|
@ -211,8 +221,8 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
|||
//}else{
|
||||
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
|
||||
//}
|
||||
}else{
|
||||
if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
|
||||
//}else{
|
||||
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,8 +55,8 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MINALLOWEDMISMATCHES = 7;
|
||||
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
|
|
|
|||
|
|
@ -67,12 +67,12 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
|||
|
||||
GATKArgumentCollection args = this.getToolkit().getArguments();
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
|
||||
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||
boolean HLAdataLoaded = false;
|
||||
String[] HLAnames, HLAreads;
|
||||
|
|
@ -216,7 +216,8 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
|||
out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
|
||||
for (int i = 0; i < HLAnames.length; i++){
|
||||
name1 = HLAnames[i].substring(4);
|
||||
if (UniqueAlleles.containsKey(name1)){
|
||||
String [] n1 = name1.split("\\*");
|
||||
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
|
||||
if (AlleleFrequencies.containsKey(name1)){
|
||||
frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||
}else{
|
||||
|
|
@ -225,19 +226,19 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
|||
//if (frq1 > minfrq){
|
||||
for (int j = i; j < HLAnames.length; j++){
|
||||
name2 = HLAnames[j].substring(4);
|
||||
if (name1.substring(0,1).equals(name2.substring(0,1))){
|
||||
if (UniqueAlleles.containsKey(name2)){
|
||||
if (AlleleFrequencies.containsKey(name2)){
|
||||
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
||||
}else{
|
||||
frq2 = .0001;
|
||||
}
|
||||
// if (frq2 > minfrq){
|
||||
likelihood = CalculatePhaseLikelihood(i,j,false);
|
||||
numCombinations++;
|
||||
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
|
||||
// }
|
||||
String [] n2 = name2.split("\\*");
|
||||
if (n1[0].equals(n2[0]) && UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
|
||||
if (AlleleFrequencies.containsKey(name2)){
|
||||
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
||||
}else{
|
||||
frq2 = .0001;
|
||||
}
|
||||
// if (frq2 > minfrq){
|
||||
likelihood = CalculatePhaseLikelihood(i,j,false);
|
||||
numCombinations++;
|
||||
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
|
||||
// }
|
||||
|
||||
}
|
||||
}
|
||||
//}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,322 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
/**
|
||||
* Compares reads to longest read at each exon. Usage: java -jar GenomeAnalysisTK.jar -T ClusterReads -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta [-filter INPUT.filter] | grep -v INFO | sort -k1 > OUTPUT
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||
public String filterFile = "";
|
||||
|
||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||
public int MINALLOWEDMISMATCHES = 5;
|
||||
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
|
||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||
|
||||
boolean DatabaseLoaded = false;
|
||||
boolean DEBUG = false;
|
||||
|
||||
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
|
||||
double[] SingleAlleleFrequencies;
|
||||
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
||||
ArrayList<SAMRecord> AlignedReads = new ArrayList<SAMRecord>();
|
||||
|
||||
Hashtable MaxNumMatches, MaxConcordance, NumCompared = new Hashtable();
|
||||
|
||||
double[] nummatched, concordance, numcompared;
|
||||
|
||||
SAMRecord A2, A3, B2, B3, C2, C3;
|
||||
int MaxMatchesA2 = 0, MaxMatchesA3 = 0, MaxMatchesB2 = 0, MaxMatchesB3 = 0, MaxMatchesC2 = 0, MaxMatchesC3 = 0;
|
||||
int A2start = 30018513, A2stop = 30018781;
|
||||
int A3start = 30019024, A3stop = 30019300;
|
||||
|
||||
int C2start = 31347355, C2stop = 31347623;
|
||||
int C3start = 31346829, C3stop = 31347104;
|
||||
|
||||
int B2start = 31432444, B2stop = 31432714;
|
||||
int B3start = 31431923, B3stop = 31432198;
|
||||
|
||||
int minstartpos = 0;
|
||||
int maxstoppos = 0;
|
||||
|
||||
|
||||
Hashtable AlleleFrequencies = new Hashtable();
|
||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||
CigarParser formatter = new CigarParser();
|
||||
|
||||
public Integer reduceInit() {
|
||||
if (!DatabaseLoaded){
|
||||
DatabaseLoaded = true;
|
||||
|
||||
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
|
||||
siteFileReader.ReadFile(PolymorphicSitesFile);
|
||||
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
||||
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
|
||||
|
||||
|
||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||
|
||||
if (!filterFile.equals("")){
|
||||
out.printf("INFO Reading properties file ... ");
|
||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||
MaxNumMatches = similarityReader.GetNumMatches();
|
||||
MaxConcordance = similarityReader.GetConcordance();
|
||||
|
||||
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
|
||||
for (int i = 0; i < ReadsToDiscard.size(); i++){
|
||||
out.printf("MISALIGNED %s\n", ReadsToDiscard.get(i).toString());
|
||||
}
|
||||
}
|
||||
|
||||
out.printf("INFO Comparing reads ...\n");
|
||||
|
||||
if (DEBUG){
|
||||
//out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||
//Calculate concordance for this read and all overlapping reads
|
||||
if (!ReadsToDiscard.contains(read.getReadName())){
|
||||
AlignedReads.add(read);
|
||||
|
||||
int readstart = read.getAlignmentStart();
|
||||
int readstop = read.getAlignmentEnd();
|
||||
int length = readstop - readstart + 1;
|
||||
if (MaxNumMatches.containsKey(read.getReadName())){
|
||||
int maxMatches = Integer.parseInt(MaxNumMatches.get(read.getReadName()).toString());
|
||||
double concordance = Double.parseDouble((String) MaxConcordance.get(read.getReadName()).toString());
|
||||
|
||||
if (readstart < A2stop && readstop > A2start){
|
||||
if (maxMatches > MaxMatchesA2 && concordance > 0.95){
|
||||
MaxMatchesA2 = maxMatches;
|
||||
A2 = read;
|
||||
}
|
||||
} else if (readstart < A3stop && readstop > A3start){
|
||||
if (maxMatches > MaxMatchesA3){
|
||||
MaxMatchesA3 = maxMatches;
|
||||
A3 = read;
|
||||
}
|
||||
} else if (readstart < B2stop && readstop > B2start){
|
||||
if (maxMatches > MaxMatchesB2){
|
||||
MaxMatchesB2 = maxMatches;
|
||||
B2 = read;
|
||||
}
|
||||
} else if (readstart < B3stop && readstop > B3start){
|
||||
if (maxMatches > MaxMatchesB3){
|
||||
MaxMatchesB3 = maxMatches;
|
||||
B3 = read;
|
||||
}
|
||||
} else if (readstart < C2stop && readstop > C2start){
|
||||
if (maxMatches > MaxMatchesC2){
|
||||
MaxMatchesC2 = maxMatches;
|
||||
C2 = read;
|
||||
}
|
||||
} else if (readstart < C3stop && readstop > C3start){
|
||||
if (maxMatches > MaxMatchesC3){
|
||||
MaxMatchesC3 = maxMatches;
|
||||
C3 = read;
|
||||
}
|
||||
}
|
||||
}else{
|
||||
out.printf("Data for %s not found\n",read.getReadName());
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer numreads) {
|
||||
SAMRecord read; String name, name2; String locus = "";
|
||||
int A2a = 0, A2b = 0, A2c = 0;
|
||||
int A3a = 0, A3b = 0, A3c = 0;
|
||||
int B2a = 0, B2b = 0, B2c = 0;
|
||||
int B3a = 0, B3b = 0, B3c = 0;
|
||||
int C2a = 0, C2b = 0, C2c = 0;
|
||||
int C3a = 0, C3b = 0, C3c = 0;
|
||||
double minA2 = -1, minA3 = -1, minB2 = -1, minB3 = -1, minC2 = -1, minC3 = -1;
|
||||
double maxA2 = 0, maxA3 = 0, maxB2 = 0, maxB3 = 0, maxC2 = 0, maxC3 = 0;
|
||||
double ratioA2 = 0, ratioA3 = 0, ratioB2 = 0, ratioB3 = 0, ratioC2 = 0, ratioC3 = 0;
|
||||
double maxA2l = 0, maxA3l = 0, maxB2l = 0, maxB3l = 0, maxC2l = 0, maxC3l = 0;
|
||||
double maxA2d = 0, maxA3d = 0, maxB2d = 0, maxB3d = 0, maxC2d = 0, maxC3d = 0;
|
||||
double a2, a3, b2, b3, c2, c3, normalized = 0;
|
||||
int readstart, readstop;
|
||||
double matches, compared, concordance;
|
||||
STATS stats;
|
||||
|
||||
for (int i = 0; i < AlignedReads.size(); i++){
|
||||
read = AlignedReads.get(i);
|
||||
readstart = read.getAlignmentStart();
|
||||
readstop = read.getAlignmentEnd();
|
||||
if (readstart < A2stop && readstop > A2start){
|
||||
stats = CalculateConcordance(read, A2, "A2", A2start, A2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||
if (stats.getNumCompared() > 40){if (minA2 < 0 || minA2 > concordance){minA2 = concordance;}; if (concordance > maxA2){maxA2 = concordance;}; if (matches > maxA2l){maxA2l = matches;}; if (compared-matches > maxA2d){maxA2d = compared-matches;}}else{A2c++;}
|
||||
} else if (readstart < A3stop && readstop > A3start){
|
||||
stats = CalculateConcordance(read, A3, "A3", A3start, A3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||
if (stats.getNumCompared() > 40){if (minA3 < 0 || minA3 > concordance){minA3 = concordance;}; if (concordance > maxA3){maxA3 = concordance;}; if (matches > maxA3l){maxA3l = matches;}; if (compared-matches > maxA3d){maxA3d = compared-matches;}}else{A3c++;}
|
||||
} else if (readstart < B2stop && readstop > B2start){
|
||||
stats = CalculateConcordance(read, B2, "B2", B2start, B2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||
if (stats.getNumCompared() > 40){if (minB2 < 0 || minB2 > concordance){minB2 = concordance;}; if (concordance > maxB2){maxB2 = concordance;}; if (matches > maxB2l){maxB2l = matches;}; if (compared-matches > maxB2d){maxB2d = compared-matches;}}else{B2c++;}
|
||||
} else if (readstart < B3stop && readstop > B3start){
|
||||
stats = CalculateConcordance(read, B3, "B3", B3start, B3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||
if (stats.getNumCompared() > 40){if (minB3 < 0 || minB3 > concordance){minB3 = concordance;}; if (concordance > maxB3){maxB3 = concordance;}; if (matches > maxB3l){maxB3l = matches;}; if (compared-matches > maxB3d){maxB3d = compared-matches;}}else{B3c++;}
|
||||
} else if (readstart < C2stop && readstop > C2start){
|
||||
stats = CalculateConcordance(read, C2, "C2", C2start, C2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||
if (stats.getNumCompared() > 40){if (minC2 < 0 || minC2 > concordance){minC2 = concordance;}; if (concordance > maxC2){maxC2 = concordance;}; if (matches > maxC2l){maxC2l = matches;}; if (compared-matches > maxC2d){maxC2d = compared-matches;}}else{C2c++;}
|
||||
} else if (readstart < C3stop && readstop > C3start){
|
||||
stats = CalculateConcordance(read, C3, "C3", C3start, C3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||
if (stats.getNumCompared() > 40){if (minC3 < 0 || minC3 > concordance){minC3 = concordance;}; if (concordance > maxC3){maxC3 = concordance;}; if (matches > maxC3l){maxC3l = matches;}; if (compared-matches > maxC3d){maxC3d = compared-matches;}}else{C3c++;}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < AlignedReads.size(); i++){
|
||||
read = AlignedReads.get(i);
|
||||
readstart = read.getAlignmentStart();
|
||||
readstop = read.getAlignmentEnd();
|
||||
name = read.getReadName(); name2 = "";
|
||||
if (NumCompared.containsKey(name)){
|
||||
compared = Double.parseDouble((String) NumCompared.get(name).toString());
|
||||
matches = Double.parseDouble((String) MaxNumMatches.get(name).toString());
|
||||
concordance = Double.parseDouble((String) MaxConcordance.get(name).toString());
|
||||
if (matches > 40){
|
||||
if (readstart < A2stop && readstop > A2start){
|
||||
locus = "A2"; name2 = A2.getReadName();
|
||||
a2 = (concordance - minA2)/(maxA2-minA2); if (a2 >= .5){A2a++;}else{A2b++;}; normalized = a2;
|
||||
} else if (readstart < A3stop && readstop > A3start){
|
||||
locus = "A3"; name2 = A3.getReadName();
|
||||
a3 = (concordance - minA3)/(maxA3-minA3); if (a3 >= .5){A3a++;}else{A3b++;}; normalized = a3;
|
||||
} else if (readstart < B2stop && readstop > B2start){
|
||||
locus = "B2"; name2 = B2.getReadName();
|
||||
b2 = (concordance - minB2)/(maxB2-minB2); if (b2 >= .5){B2a++;}else{B2b++;}; normalized = b2;
|
||||
} else if (readstart < B3stop && readstop > B3start){
|
||||
locus = "B3"; name2 = B3.getReadName();
|
||||
b3 = (concordance - minB3)/(maxB3-minB3); if (b3 >= .5){B3a++;}else{B3b++;}; normalized = b3;
|
||||
} else if (readstart < C2stop && readstop > C2start){
|
||||
locus = "C2"; name2 = C2.getReadName();
|
||||
c2 = (concordance - minC2)/(maxC2-minC2); if (c2 >= .5){C2a++;}else{C2b++;}; normalized = c2;
|
||||
} else if (readstart < C3stop && readstop > C3start){
|
||||
locus = "C3"; name2 = C3.getReadName();
|
||||
c3 = (concordance - minC3)/(maxC3-minC3); if (c3 >= .5){C3a++;}else{C3b++;}; normalized = c3;
|
||||
}
|
||||
out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\t%.3f\n",locus,name,name2,matches,compared,concordance,normalized);
|
||||
}else{
|
||||
out.printf("%s (compared at %s sites) is too short\n",name,matches);
|
||||
}
|
||||
}else{
|
||||
out.printf("%s [%s to %s] not found\n",name,readstart,readstop);
|
||||
}
|
||||
}
|
||||
|
||||
if (A2a > 0 && A2b > 0){if (A2a > A2b){ratioA2 = (double)A2b/(A2a+A2b);}else{ratioA2 = (double)A2a/(A2a+A2b);}}else{ratioA2 = -1;}
|
||||
if (A3a > 0 && A3b > 0){if (A3a > A3b){ratioA3 = (double)A3b/(A3a+A3b);}else{ratioA3 = (double)A3a/(A3a+A3b);}}else{ratioA3 = -1;}
|
||||
if (B2a > 0 && B2b > 0){if (B2a > B2b){ratioB2 = (double)B2b/(B2a+B2b);}else{ratioB2 = (double)B2a/(B2a+B2b);}}else{ratioB2 = -1;}
|
||||
if (B3a > 0 && B3b > 0){if (B3a > B3b){ratioB3 = (double)B3b/(B3a+B3b);}else{ratioB3 = (double)B3a/(B3a+B3b);}}else{ratioB3 = -1;}
|
||||
if (C2a > 0 && C2b > 0){if (C2a > C2b){ratioC2 = (double)C2b/(C2a+C2b);}else{ratioC2 = (double)C2a/(C2a+C2b);}}else{ratioC2 = -1;}
|
||||
if (C3a > 0 && C3b > 0){if (C3a > C3b){ratioC3 = (double)C3b/(C3a+C3b);}else{ratioC3 = (double)C3a/(C3a+C3b);}}else{ratioC3 = -1;}
|
||||
|
||||
out.printf("RATIO_A2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA2,A2a,A2b,A2c,maxA2-minA2,maxA2l,maxA2d);
|
||||
out.printf("RATIO_A3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA3,A3a,A3b,A3c,maxA3-minA3,maxA3l,maxA3d);
|
||||
out.printf("RATIO_B2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB2,B2a,B2b,B2c,maxB2-minB2,maxB2l,maxB2d);
|
||||
out.printf("RATIO_B3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB3,B3a,B3b,B3c,maxB3-minB3,maxB3l,maxB3d);
|
||||
out.printf("RATIO_C2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC2,C2a,C2b,C2c,maxC2-minC2,maxC2l,maxC2d);
|
||||
out.printf("RATIO_C3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC3,C3a,C3b,C3c,maxC3-minC3,maxC3l,maxC3d);
|
||||
|
||||
}
|
||||
|
||||
public class STATS {
|
||||
protected double concordance = 0.0;
|
||||
protected double numcompared = 0;
|
||||
protected double nummatches = 0;
|
||||
|
||||
public STATS(double d, double i, double m) {
|
||||
concordance = d;
|
||||
numcompared = i;
|
||||
nummatches = m;
|
||||
}
|
||||
|
||||
public double getConcordance() {
|
||||
return concordance;
|
||||
}
|
||||
|
||||
public double getNumCompared() {
|
||||
return numcompared;
|
||||
}
|
||||
|
||||
public double getNumMatches() {
|
||||
return nummatches;
|
||||
}
|
||||
}
|
||||
|
||||
private STATS CalculateConcordance(SAMRecord read1, SAMRecord read2, String locus, int start, int stop){
|
||||
int start1 = read1.getAlignmentStart(), stop1 = read1.getAlignmentEnd();
|
||||
int start2 = read2.getAlignmentStart(), stop2 = read2.getAlignmentEnd();
|
||||
|
||||
int pos;
|
||||
double numcompared = 0, nummatched = 0, concordance;
|
||||
char c1, c2;
|
||||
String s1 = formatter.FormatRead(read1.getCigarString(), read1.getReadString());
|
||||
String s2 = formatter.FormatRead(read2.getCigarString(), read2.getReadString());
|
||||
|
||||
|
||||
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
|
||||
for (int j = 0; j < PolymorphicSites.length; j++){
|
||||
pos = PolymorphicSites[j];
|
||||
if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2 && pos >= start && pos <= stop){
|
||||
c1 = s1.charAt(pos-start1);
|
||||
c2 = s2.charAt(pos-start2);
|
||||
if (c1 != 'D'){//allow for deletions (sequencing errors)
|
||||
numcompared++;
|
||||
if (c1 == c2){
|
||||
nummatched++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Non-polymorphic sites: increment denominator only when bases are discordant
|
||||
if (false){
|
||||
for (int j = 0; j < NonPolymorphicSites.length; j++){
|
||||
pos = NonPolymorphicSites[j];
|
||||
if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2){
|
||||
c1 = s1.charAt(pos-start1);
|
||||
c2 = s2.charAt(pos-start2);
|
||||
if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
|
||||
numcompared++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Update concordance array
|
||||
concordance=nummatched/numcompared;
|
||||
|
||||
MaxNumMatches.put(read1.getReadName(), nummatched);
|
||||
NumCompared.put(read1.getReadName(), numcompared);
|
||||
MaxConcordance.put(read1.getReadName(), concordance);
|
||||
//out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\n",locus,read1.getReadName(),read2.getReadName(),nummatched,numcompared,concordance);
|
||||
|
||||
return new STATS(concordance, numcompared, nummatched);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -52,6 +52,12 @@ public class CreatePedFileWalker extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName = "DNAcode", shortName = "DNAcode", doc = "Amino acid codes", required = false)
|
||||
public String dnaCodesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/DNA_CODE.txt";
|
||||
|
||||
@Argument(fullName = "PrintDNA", shortName = "PrintDNA", doc = "Print DNA sequences", required = false)
|
||||
public boolean PrintDNA = false;
|
||||
|
||||
@Argument(fullName = "PrintAA", shortName = "PrintAA", doc = "Print Amino Acid sequences", required = false)
|
||||
public boolean PrintAA = true;
|
||||
|
||||
String[] HLAnames, HLAreads, inputFileContents;
|
||||
Integer[] HLAstartpos, HLAstoppos;
|
||||
ArrayList<String> HLAnamesAL, HLAreadsAL;
|
||||
|
|
@ -408,23 +414,26 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2
|
|||
|
||||
|
||||
if (true) {
|
||||
error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end);
|
||||
|
||||
error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons);
|
||||
if (PrintDNA){
|
||||
error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end);
|
||||
error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end);
|
||||
}
|
||||
if (PrintAA){
|
||||
error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons);
|
||||
error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons);
|
||||
}
|
||||
out.printf("\n");
|
||||
out.printf("%s",error);
|
||||
}
|
||||
|
|
@ -433,23 +442,27 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2
|
|||
|
||||
//Prints SNP names for each site
|
||||
if (true){
|
||||
PrintSNPS(HLA_A_start,HLA_A_end);
|
||||
PrintSNPS(HLA_C_start,HLA_C_end);
|
||||
PrintSNPS(HLA_B_start,HLA_B_end);
|
||||
PrintSNPS(HLA_DRB1_start,HLA_DRB1_end);
|
||||
PrintSNPS(HLA_DQA1_start,HLA_DQA1_end);
|
||||
PrintSNPS(HLA_DQB1_start,HLA_DQB1_end);
|
||||
PrintSNPS(HLA_DPA1_start,HLA_DPA1_end);
|
||||
PrintSNPS(HLA_DPB1_start,HLA_DPB1_end);
|
||||
if (PrintDNA){
|
||||
PrintSNPS(HLA_A_start,HLA_A_end);
|
||||
PrintSNPS(HLA_C_start,HLA_C_end);
|
||||
PrintSNPS(HLA_B_start,HLA_B_end);
|
||||
PrintSNPS(HLA_DRB1_start,HLA_DRB1_end);
|
||||
PrintSNPS(HLA_DQA1_start,HLA_DQA1_end);
|
||||
PrintSNPS(HLA_DQB1_start,HLA_DQB1_end);
|
||||
PrintSNPS(HLA_DPA1_start,HLA_DPA1_end);
|
||||
PrintSNPS(HLA_DPB1_start,HLA_DPB1_end);
|
||||
}
|
||||
|
||||
PrintAminoAcidSites(A_exons,"A",true);
|
||||
PrintAminoAcidSites(C_exons,"C",false);
|
||||
PrintAminoAcidSites(B_exons,"B",false);
|
||||
PrintAminoAcidSites(DRB1_exons,"DRB1",false);
|
||||
PrintAminoAcidSites(DQA1_exons,"DQA1",true);
|
||||
PrintAminoAcidSites(DQB1_exons,"DQB1",false);
|
||||
PrintAminoAcidSites(DPA1_exons,"DPA1",false);
|
||||
PrintAminoAcidSites(DPB1_exons,"DPB1",true);
|
||||
if (PrintAA){
|
||||
PrintAminoAcidSites(A_exons,"A",true);
|
||||
PrintAminoAcidSites(C_exons,"C",false);
|
||||
PrintAminoAcidSites(B_exons,"B",false);
|
||||
PrintAminoAcidSites(DRB1_exons,"DRB1",false);
|
||||
PrintAminoAcidSites(DQA1_exons,"DQA1",true);
|
||||
PrintAminoAcidSites(DQB1_exons,"DQB1",false);
|
||||
PrintAminoAcidSites(DPA1_exons,"DPA1",false);
|
||||
PrintAminoAcidSites(DPB1_exons,"DPB1",true);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
|
||||
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
|
||||
public boolean findFirst = false;
|
||||
|
||||
@Argument(fullName = "DEBUG", shortName = "DEBUG", doc = "Debug walker", required = false)
|
||||
public boolean debug = false;
|
||||
|
||||
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
|
||||
public String debugAllele = "";
|
||||
|
|
@ -50,16 +53,18 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "Caucasian";
|
||||
|
||||
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
|
||||
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
|
||||
|
||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
|
||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||
|
||||
|
|
@ -74,6 +79,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
int numHLAlleles = 0;
|
||||
int minstartpos = 0;
|
||||
int maxstoppos = 0;
|
||||
int numpolymorphicsites = 0, numnonpolymorphicsites = 0, pos =0;
|
||||
|
||||
int HLA_A_start = 30018310;
|
||||
int HLA_A_end = 30021211;
|
||||
|
|
@ -89,7 +95,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
//Load HLA dictionary
|
||||
out.printf("INFO Loading HLA dictionary ... ");
|
||||
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLADictionaryReader.ReadFile(HLAdictionaryFile);
|
||||
HLAreads = HLADictionaryReader.GetReads();
|
||||
HLAnames = HLADictionaryReader.GetReadNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
|
|
@ -121,7 +127,8 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
siteFileReader.ReadFile(PolymorphicSitesFile);
|
||||
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
||||
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
|
||||
|
||||
numpolymorphicsites = PolymorphicSites.length;
|
||||
numnonpolymorphicsites = NonPolymorphicSites.length;
|
||||
|
||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||
out.printf("INFO Comparing reads to database ...\n");
|
||||
|
|
@ -133,53 +140,15 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
private void FindPolymorphicSites(int start, int stop){
|
||||
boolean initialized, polymorphic, examined;
|
||||
char c = ' ';
|
||||
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
|
||||
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
|
||||
//Find polymorphic sites in dictionary
|
||||
for (int pos = start; pos <= stop; pos++){
|
||||
initialized = false; polymorphic = false; examined = false;
|
||||
//look across all alleles at specific position to see if it is polymorphic
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
||||
if (!initialized){
|
||||
c = HLAreads[i].charAt(pos-HLAstartpos[i]);
|
||||
initialized = true;
|
||||
examined = true;
|
||||
}
|
||||
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
|
||||
polymorphicsites.add(pos);
|
||||
out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
||||
polymorphic = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if (!polymorphic && examined){
|
||||
nonpolymorphicsites.add(pos);
|
||||
out.printf("CONSERVED\t6\t%s\n", pos);
|
||||
}
|
||||
}
|
||||
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
|
||||
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
|
||||
}
|
||||
|
||||
private double CalculateConcordance(SAMRecord read){
|
||||
int readstart = read.getAlignmentStart();
|
||||
int readstop = read.getAlignmentEnd();
|
||||
int numpolymorphicsites, numnonpolymorphicsites, pos;
|
||||
char c1, c2;
|
||||
double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0;
|
||||
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
|
||||
String s2;
|
||||
int allelestart, allelestop;
|
||||
|
||||
numpolymorphicsites = PolymorphicSites.length;
|
||||
numnonpolymorphicsites = NonPolymorphicSites.length;
|
||||
|
||||
if (ONLYFREQUENT){
|
||||
minFreq = 0.0001;
|
||||
}
|
||||
|
|
@ -200,7 +169,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != 'D'){//allow for deletions (sequencing errors)
|
||||
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
||||
numcompared[i]++;
|
||||
if (c1 == c2){
|
||||
nummatched[i]++;
|
||||
|
|
@ -214,16 +183,17 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
//Non-polymorphic sites: increment denominator only when bases are discordant
|
||||
|
||||
for (int j = 0; j < numnonpolymorphicsites; j++){
|
||||
pos = NonPolymorphicSites[j];
|
||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
|
||||
numcompared[i]++;
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||
if (numcompared[i] > 0){
|
||||
for (int j = 0; j < numnonpolymorphicsites; j++){
|
||||
pos = NonPolymorphicSites[j];
|
||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
||||
numcompared[i]++;
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -259,26 +229,32 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
|
||||
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
//Calculate concordance for this read and all overlapping reads
|
||||
double maxConcordance = CalculateConcordance(read);
|
||||
if (read.getMappingQuality() > 0){
|
||||
double maxConcordance = CalculateConcordance(read);
|
||||
|
||||
String readname = read.getReadName(), allelename = ""; double freq;
|
||||
//For input bam files that contain HLA alleles, find and print allele frequency
|
||||
freq = GetAlleleFrequency(readname);
|
||||
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
|
||||
if (maxConcordance > 0){
|
||||
String readname = read.getReadName(), allelename = ""; double freq;
|
||||
//For input bam files that contain HLA alleles, find and print allele frequency
|
||||
//freq = GetAlleleFrequency(readname);
|
||||
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
|
||||
|
||||
//Find the maximum frequency of the alleles most concordant with the read
|
||||
double maxFreq = FindMaxAlleleFrequency(maxConcordance);
|
||||
|
||||
//Print concordance statistics between this read and the most similar HLA allele(s)
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (concordance[i] == maxConcordance && maxConcordance > 0){
|
||||
freq = GetAlleleFrequency(HLAnames[i]);
|
||||
if (freq == maxFreq){
|
||||
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
//Find the maximum frequency of the alleles most concordant with the read
|
||||
//double maxFreq = FindMaxAlleleFrequency(maxConcordance);
|
||||
|
||||
//Print concordance statistics between this read and the most similar HLA allele(s)
|
||||
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (concordance[i] == maxConcordance){
|
||||
freq = GetAlleleFrequency(HLAnames[i]);
|
||||
//if (freq == maxFreq){
|
||||
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
//}
|
||||
break;
|
||||
}
|
||||
}
|
||||
out.print("\n");
|
||||
}
|
||||
}
|
||||
out.print("\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,152 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
/**
|
||||
* Finds polymorphic sites in the HLA dictionary. Usage: java -jar GenomeAnalysisTK.jar -T FindPolymorphicSites -I HLA_DICTIONARY.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
|
||||
public String debugRead = "";
|
||||
|
||||
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
|
||||
public boolean findFirst = false;
|
||||
|
||||
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
|
||||
public String debugAllele = "";
|
||||
|
||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "Caucasian";
|
||||
|
||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||
|
||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||
|
||||
//String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||
|
||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||
|
||||
boolean DatabaseLoaded = false;
|
||||
boolean DEBUG = false;
|
||||
|
||||
String[] HLAnames, HLAreads;
|
||||
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
|
||||
double[] SingleAlleleFrequencies;
|
||||
|
||||
double[] nummatched, concordance, numcompared;
|
||||
int numHLAlleles = 0;
|
||||
int minstartpos = 0;
|
||||
int maxstoppos = 0;
|
||||
|
||||
int HLA_A_start = 30018310;
|
||||
int HLA_A_end = 30021211;
|
||||
|
||||
Hashtable AlleleFrequencies = new Hashtable();
|
||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||
CigarParser formatter = new CigarParser();
|
||||
|
||||
public Integer reduceInit() {
|
||||
if (!DatabaseLoaded){
|
||||
DatabaseLoaded = true;
|
||||
|
||||
//Load HLA dictionary
|
||||
out.printf("INFO Loading HLA dictionary ... ");
|
||||
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLAreads = HLADictionaryReader.GetReads();
|
||||
HLAnames = HLADictionaryReader.GetReadNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
minstartpos = HLADictionaryReader.GetMinStartPos();
|
||||
maxstoppos = HLADictionaryReader.GetMaxStopPos();
|
||||
|
||||
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
||||
|
||||
nummatched = new double[HLAreads.length];
|
||||
concordance = new double[HLAreads.length];
|
||||
numcompared = new double[HLAreads.length];
|
||||
|
||||
//Read allele frequencies
|
||||
if (ethnicity.equals("Black")){
|
||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||
}else{
|
||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
FindPolymorphicSites(minstartpos,maxstoppos);
|
||||
|
||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||
out.printf("INFO Comparing reads to database ...\n");
|
||||
|
||||
if (DEBUG){
|
||||
//out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private void FindPolymorphicSites(int start, int stop){
|
||||
boolean initialized, polymorphic, examined;
|
||||
char c = ' ';
|
||||
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
|
||||
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
|
||||
//Find polymorphic sites in dictionary
|
||||
for (int pos = start; pos <= stop; pos++){
|
||||
initialized = false; polymorphic = false; examined = false;
|
||||
//look across all alleles at specific position to see if it is polymorphic
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
||||
if (!initialized){
|
||||
c = HLAreads[i].charAt(pos-HLAstartpos[i]);
|
||||
initialized = true;
|
||||
examined = true;
|
||||
}
|
||||
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
|
||||
polymorphicsites.add(pos);
|
||||
out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
||||
polymorphic = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if (!polymorphic && examined){
|
||||
nonpolymorphicsites.add(pos);
|
||||
out.printf("CONSERVED\t6\t%s\n", pos);
|
||||
}
|
||||
}
|
||||
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
|
||||
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
|
||||
}
|
||||
|
||||
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
//Calculate concordance for this read and all overlapping reads
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -16,8 +16,9 @@ import java.util.Hashtable;
|
|||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam";
|
||||
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.closest";
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||
// String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS1.closest";
|
||||
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS2.closest";
|
||||
|
||||
boolean DatabaseLoaded = false;
|
||||
boolean DEBUG = false;
|
||||
|
|
@ -40,14 +41,24 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
|||
int HLA_B_end = 31432914;
|
||||
int HLA_C_start = 31344925;
|
||||
int HLA_C_end = 31347827;
|
||||
int HLA_DQA1_start = 32713161;
|
||||
int HLA_DQA1_end = 32719407;
|
||||
int HLA_DQB1_start = 32735635;
|
||||
int HLA_DQB1_end = 32742444;
|
||||
int HLA_DPA1_start = 33140772;
|
||||
int HLA_DPA1_end = 33149356;
|
||||
int HLA_DPB1_start = 33151738;
|
||||
int HLA_DPB1_end = 33162954;
|
||||
int HLA_DRB1_start = 32654525;
|
||||
int HLA_DRB1_end = 32665540;
|
||||
|
||||
|
||||
ArrayList<String> PolymorphicSites = new ArrayList<String>();
|
||||
|
||||
Hashtable ClosestAllele = new Hashtable();
|
||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1, iDRBstart = -1, iDRBstop = -1, iDQAstart = -1, iDQAstop = -1, iDQBstart = -1, iDQBstop = -1, iDPAstart = -1, iDPAstop = -1, iDPBstart = -1, iDPBstop = -1;
|
||||
CigarParser formatter = new CigarParser();
|
||||
|
||||
|
||||
public Integer reduceInit() {
|
||||
if (!DatabaseLoaded){
|
||||
try{
|
||||
|
|
@ -77,6 +88,21 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
|||
}else if (s[0].indexOf("HLA_C") > -1){
|
||||
if (iCstart < 0){iCstart=i;}
|
||||
iCstop = i; i++;
|
||||
}else if (s[0].indexOf("HLA_DRB1") > -1){
|
||||
if (iDRBstart < 0){iDRBstart=i;}
|
||||
iDRBstop = i; i++;
|
||||
}else if (s[0].indexOf("HLA_DQA1") > -1){
|
||||
if (iDQAstart < 0){iDQAstart=i;}
|
||||
iDQAstop = i; i++;
|
||||
}else if (s[0].indexOf("HLA_DQB1") > -1){
|
||||
if (iDQBstart < 0){iDQBstart=i;}
|
||||
iDQBstop = i; i++;
|
||||
}else if (s[0].indexOf("HLA_DPA1") > -1){
|
||||
if (iDPAstart < 0){iDPAstart=i;}
|
||||
iDPAstop = i; i++;
|
||||
}else if (s[0].indexOf("HLA_DPB1") > -1){
|
||||
if (iDPBstart < 0){iDPBstart=i;}
|
||||
iDPBstop = i; i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -146,10 +172,12 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
|||
int numM = 0, numI = 0, numD = 0;
|
||||
|
||||
name = read.getReadName();
|
||||
|
||||
String matchedAllele = (String) ClosestAllele.get(name);
|
||||
|
||||
//out.printf("%s\t%s\n",name,matchedAllele);
|
||||
int index = HLAnames.indexOf(matchedAllele);
|
||||
|
||||
String matchedRead = HLAreads.get(index);
|
||||
|
||||
if (name.indexOf("HLA_A") > -1){
|
||||
|
|
@ -161,8 +189,24 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
|||
} else if (name.indexOf("HLA_C") > -1){
|
||||
startimputation = HLA_C_start;
|
||||
stopimputation = HLA_C_end;
|
||||
} else if (name.indexOf("HLA_DRB1") > -1){
|
||||
startimputation = HLA_DRB1_start;
|
||||
stopimputation = HLA_DRB1_end;
|
||||
} else if (name.indexOf("HLA_DQA1") > -1){
|
||||
startimputation = HLA_DQA1_start;
|
||||
stopimputation = HLA_DQA1_end;
|
||||
} else if (name.indexOf("HLA_DQB1") > -1){
|
||||
startimputation = HLA_DQB1_start;
|
||||
stopimputation = HLA_DQB1_end;
|
||||
} else if (name.indexOf("HLA_DPA1") > -1){
|
||||
startimputation = HLA_DPA1_start;
|
||||
stopimputation = HLA_DPA1_end;
|
||||
} else if (name.indexOf("HLA_DPB1") > -1){
|
||||
startimputation = HLA_DPB1_start;
|
||||
stopimputation = HLA_DPB1_end;
|
||||
}
|
||||
|
||||
//out.printf("DEBUG %s\t%s\t%s\t%s\t%s\n",name,matchedAllele,index,startimputation,stopimputation);
|
||||
for (int i = startimputation; i <= stopimputation; i++){
|
||||
//if position is within read
|
||||
if (i >= readstart && i <= readstop){
|
||||
|
|
|
|||
|
|
@ -7,12 +7,16 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
|||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
public class SimilarityFileReader {
|
||||
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
||||
Hashtable Concordance = new Hashtable();
|
||||
Hashtable NumMatches = new Hashtable();
|
||||
Hashtable NumMismatches = new Hashtable();
|
||||
|
||||
public ArrayList<String> GetReadsToDiscard(){
|
||||
return ReadsToDiscard;
|
||||
|
|
@ -22,6 +26,18 @@ public class SimilarityFileReader {
|
|||
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
|
||||
}
|
||||
|
||||
public Hashtable GetConcordance(){
|
||||
return Concordance;
|
||||
}
|
||||
|
||||
public Hashtable GetNumMatches(){
|
||||
return NumMatches;
|
||||
}
|
||||
|
||||
public Hashtable GetNumMismatches(){
|
||||
return NumMismatches;
|
||||
}
|
||||
|
||||
public void ReadFile(String filename, int minAllowedMismatches){
|
||||
try{
|
||||
FileInputStream fstream = new FileInputStream(filename);
|
||||
|
|
@ -35,6 +51,10 @@ public class SimilarityFileReader {
|
|||
if (s.length >= 6){
|
||||
Double matchFraction = Double.valueOf(s[4]);
|
||||
int numMismatches = Integer.valueOf(s[6]);
|
||||
|
||||
Concordance.put(s[0],matchFraction);
|
||||
NumMatches.put(s[0], s[5]);
|
||||
NumMismatches.put(s[0], numMismatches);
|
||||
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
|
||||
ReadsToDiscard.add(s[0]);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue