HLA caller updated to examine class II loci, updated pointers to dictionary, allele frequencies.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3290 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2010-05-03 14:54:52 +00:00
parent 97fdd92e7b
commit 94b51de401
9 changed files with 668 additions and 130 deletions

View File

@ -56,10 +56,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "CaucasianUSA";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
Hashtable AlleleFrequencies,UniqueAlleles;
CigarParser formatter = new CigarParser();
@ -95,6 +95,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
//out.printf("INFO Common alleles:\n");
for (int i = 1; i < UniqueAlleles.size(); i++){
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
}
out.printf("INFO Reading HLA dictionary ...");
@ -188,12 +192,18 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
for (int i = 0; i < numreads; i++){
name1 = HLAnames[i].substring(4);
if (UniqueAlleles.containsKey(name1)){
String [] n1 = name1.split("\\*");
// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3));
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
//out.printf("1: %s\n",name1);
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
//if (frq1 > minfrq){
for (int j = i; j < numreads; j++){
name2 = HLAnames[j].substring(4);
if (UniqueAlleles.containsKey(name2)){
String [] n2 = name2.split("\\*");
// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3));
if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
// if (frq2 > minfrq){
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
@ -211,8 +221,8 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
//}else{
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
//}
}else{
if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
//}else{
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
}
}
}

View File

@ -55,8 +55,8 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
public int MINALLOWEDMISMATCHES = 7;
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";

View File

@ -67,12 +67,12 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
GATKArgumentCollection args = this.getToolkit().getArguments();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
SAMFileReader HLADictionaryReader = new SAMFileReader();
boolean HLAdataLoaded = false;
String[] HLAnames, HLAreads;
@ -216,7 +216,8 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
for (int i = 0; i < HLAnames.length; i++){
name1 = HLAnames[i].substring(4);
if (UniqueAlleles.containsKey(name1)){
String [] n1 = name1.split("\\*");
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
if (AlleleFrequencies.containsKey(name1)){
frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
}else{
@ -225,19 +226,19 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
//if (frq1 > minfrq){
for (int j = i; j < HLAnames.length; j++){
name2 = HLAnames[j].substring(4);
if (name1.substring(0,1).equals(name2.substring(0,1))){
if (UniqueAlleles.containsKey(name2)){
if (AlleleFrequencies.containsKey(name2)){
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
}else{
frq2 = .0001;
}
// if (frq2 > minfrq){
likelihood = CalculatePhaseLikelihood(i,j,false);
numCombinations++;
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
// }
String [] n2 = name2.split("\\*");
if (n1[0].equals(n2[0]) && UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
if (AlleleFrequencies.containsKey(name2)){
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
}else{
frq2 = .0001;
}
// if (frq2 > minfrq){
likelihood = CalculatePhaseLikelihood(i,j,false);
numCombinations++;
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
// }
}
}
//}

View File

@ -0,0 +1,322 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList;
import java.util.Hashtable;
/**
* Compares reads to longest read at each exon. Usage: java -jar GenomeAnalysisTK.jar -T ClusterReads -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta [-filter INPUT.filter] | grep -v INFO | sort -k1 > OUTPUT
* @author shermanjia
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
    public String filterFile = "";
    // NOTE(review): the doc string says "default 7" but the actual default is 5 — confirm which is intended.
    @Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
    public int MINALLOWEDMISMATCHES = 5;

    String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
    String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
    boolean DatabaseLoaded = false;
    boolean DEBUG = false;
    Integer[] HLAstartpos, HLAstoppos, PolymorphicSites, NonPolymorphicSites;
    double[] SingleAlleleFrequencies;
    ArrayList<String> ReadsToDiscard = new ArrayList<String>();
    ArrayList<SAMRecord> AlignedReads = new ArrayList<SAMRecord>();

    // FIX: initialize all three tables. The original declaration
    // (Hashtable MaxNumMatches, MaxConcordance, NumCompared = new Hashtable();)
    // only initialized NumCompared, so when no -filter file was supplied,
    // MaxNumMatches/MaxConcordance stayed null and map() threw an NPE on
    // MaxNumMatches.containsKey(...). When a filter file IS supplied these
    // tables are overwritten in reduceInit(), so that path is unchanged.
    Hashtable MaxNumMatches = new Hashtable(), MaxConcordance = new Hashtable(), NumCompared = new Hashtable();

    double[] nummatched, concordance, numcompared;
    // "Best" (highest-similarity) representative read for each exon of each locus
    // (A/B/C x exon 2/exon 3); every other read at that exon is compared against it.
    SAMRecord A2, A3, B2, B3, C2, C3;
    int MaxMatchesA2 = 0, MaxMatchesA3 = 0, MaxMatchesB2 = 0, MaxMatchesB3 = 0, MaxMatchesC2 = 0, MaxMatchesC3 = 0;
    // Genomic coordinates of exons 2 and 3 at HLA-A, -C and -B.
    // NOTE(review): presumably chr6 positions on build b36 (the usage line references
    // human_b36_both.fasta) — confirm before reusing elsewhere.
    int A2start = 30018513, A2stop = 30018781;
    int A3start = 30019024, A3stop = 30019300;
    int C2start = 31347355, C2stop = 31347623;
    int C3start = 31346829, C3stop = 31347104;
    int B2start = 31432444, B2stop = 31432714;
    int B3start = 31431923, B3stop = 31432198;
    int minstartpos = 0;
    int maxstoppos = 0;
    Hashtable AlleleFrequencies = new Hashtable();
    int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
    CigarParser formatter = new CigarParser();

    /**
     * One-time setup (guarded by DatabaseLoaded): loads the polymorphic/conserved
     * site lists, and — if a -filter file was given — the per-read similarity data
     * (reads to discard, match counts, concordance values) produced by a previous run.
     *
     * @return 0 (the initial reduce value)
     */
    public Integer reduceInit() {
        if (!DatabaseLoaded){
            DatabaseLoaded = true;
            PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
            siteFileReader.ReadFile(PolymorphicSitesFile);
            PolymorphicSites = siteFileReader.GetPolymorphicSites();
            NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
            out.printf("INFO  %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
            if (!filterFile.equals("")){
                out.printf("INFO  Reading properties file ... ");
                SimilarityFileReader similarityReader = new SimilarityFileReader();
                similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
                ReadsToDiscard = similarityReader.GetReadsToDiscard();
                // Replaces the empty tables declared above with data from the filter file.
                MaxNumMatches = similarityReader.GetNumMatches();
                MaxConcordance = similarityReader.GetConcordance();
                out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
                for (int i = 0; i < ReadsToDiscard.size(); i++){
                    out.printf("MISALIGNED\t%s\n", ReadsToDiscard.get(i).toString());
                }
            }
            out.printf("INFO  Comparing reads ...\n");
            if (DEBUG){
                //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
            }
        }
        return 0;
    }

    /**
     * Per-read traversal step: records each non-discarded read, and if similarity
     * data exists for it, promotes it to the "best" representative of whichever
     * exon it overlaps when it beats the current best match count (the A2 slot
     * additionally requires concordance > 0.95).
     *
     * @param ref     reference bases over the read (unused here)
     * @param read    the aligned read
     * @param tracker metadata tracker (unused here)
     * @return 1 (counted by reduce)
     */
    public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker tracker) {
        //Calculate concordance for this read and all overlapping reads
        if (!ReadsToDiscard.contains(read.getReadName())){
            AlignedReads.add(read);
            int readstart = read.getAlignmentStart();
            int readstop = read.getAlignmentEnd();
            int length = readstop - readstart + 1;
            if (MaxNumMatches.containsKey(read.getReadName())){
                int maxMatches = Integer.parseInt(MaxNumMatches.get(read.getReadName()).toString());
                double concordance = Double.parseDouble((String) MaxConcordance.get(read.getReadName()).toString());
                // Overlap tests: read intersects [Xstart, Xstop]. Each read is
                // assigned to at most one exon (first match in this chain wins).
                if (readstart < A2stop && readstop > A2start){
                    if (maxMatches > MaxMatchesA2 && concordance > 0.95){
                        MaxMatchesA2 = maxMatches;
                        A2 = read;
                    }
                } else if (readstart < A3stop && readstop > A3start){
                    if (maxMatches > MaxMatchesA3){
                        MaxMatchesA3 = maxMatches;
                        A3 = read;
                    }
                } else if (readstart < B2stop && readstop > B2start){
                    if (maxMatches > MaxMatchesB2){
                        MaxMatchesB2 = maxMatches;
                        B2 = read;
                    }
                } else if (readstart < B3stop && readstop > B3start){
                    if (maxMatches > MaxMatchesB3){
                        MaxMatchesB3 = maxMatches;
                        B3 = read;
                    }
                } else if (readstart < C2stop && readstop > C2start){
                    if (maxMatches > MaxMatchesC2){
                        MaxMatchesC2 = maxMatches;
                        C2 = read;
                    }
                } else if (readstart < C3stop && readstop > C3start){
                    if (maxMatches > MaxMatchesC3){
                        MaxMatchesC3 = maxMatches;
                        C3 = read;
                    }
                }
            }else{
                out.printf("Data for %s not found\n",read.getReadName());
            }
        }
        return 1;
    }

    /** Accumulates the number of reads visited. */
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Two passes over the collected reads, then a summary.
     * Pass 1: compare every read to its exon's representative read, tracking the
     * min/max concordance, max matches and max discordant-base count per exon
     * (only reads with &gt; 40 compared sites count; shorter ones are tallied in Xc).
     * Pass 2: normalize each read's concordance into [0,1] over the exon's observed
     * range and split reads into two clusters at 0.5 (Xa vs Xb), printing one line
     * per read. Finally prints per-exon RATIO_X lines with the minority-cluster
     * fraction (-1 when either cluster is empty).
     *
     * @param numreads total read count from reduce
     */
    public void onTraversalDone(Integer numreads) {
        SAMRecord read; String name, name2; String locus = "";
        // Per-exon tallies: Xa = cluster >= 0.5, Xb = cluster < 0.5, Xc = too short.
        int A2a = 0, A2b = 0, A2c = 0;
        int A3a = 0, A3b = 0, A3c = 0;
        int B2a = 0, B2b = 0, B2c = 0;
        int B3a = 0, B3b = 0, B3c = 0;
        int C2a = 0, C2b = 0, C2c = 0;
        int C3a = 0, C3b = 0, C3c = 0;
        double minA2 = -1, minA3 = -1, minB2 = -1, minB3 = -1, minC2 = -1, minC3 = -1;
        double maxA2 = 0, maxA3 = 0, maxB2 = 0, maxB3 = 0, maxC2 = 0, maxC3 = 0;
        double ratioA2 = 0, ratioA3 = 0, ratioB2 = 0, ratioB3 = 0, ratioC2 = 0, ratioC3 = 0;
        double maxA2l = 0, maxA3l = 0, maxB2l = 0, maxB3l = 0, maxC2l = 0, maxC3l = 0;
        double maxA2d = 0, maxA3d = 0, maxB2d = 0, maxB3d = 0, maxC2d = 0, maxC3d = 0;
        double a2, a3, b2, b3, c2, c3, normalized = 0;
        int readstart, readstop;
        double matches, compared, concordance;
        STATS stats;
        // Pass 1: establish per-exon concordance ranges.
        for (int i = 0; i < AlignedReads.size(); i++){
            read = AlignedReads.get(i);
            readstart = read.getAlignmentStart();
            readstop = read.getAlignmentEnd();
            if (readstart < A2stop && readstop > A2start){
                stats = CalculateConcordance(read, A2, "A2", A2start, A2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minA2 < 0 || minA2 > concordance){minA2 = concordance;}; if (concordance > maxA2){maxA2 = concordance;}; if (matches > maxA2l){maxA2l = matches;}; if (compared-matches > maxA2d){maxA2d = compared-matches;}}else{A2c++;}
            } else if (readstart < A3stop && readstop > A3start){
                stats = CalculateConcordance(read, A3, "A3", A3start, A3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minA3 < 0 || minA3 > concordance){minA3 = concordance;}; if (concordance > maxA3){maxA3 = concordance;}; if (matches > maxA3l){maxA3l = matches;}; if (compared-matches > maxA3d){maxA3d = compared-matches;}}else{A3c++;}
            } else if (readstart < B2stop && readstop > B2start){
                stats = CalculateConcordance(read, B2, "B2", B2start, B2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minB2 < 0 || minB2 > concordance){minB2 = concordance;}; if (concordance > maxB2){maxB2 = concordance;}; if (matches > maxB2l){maxB2l = matches;}; if (compared-matches > maxB2d){maxB2d = compared-matches;}}else{B2c++;}
            } else if (readstart < B3stop && readstop > B3start){
                stats = CalculateConcordance(read, B3, "B3", B3start, B3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minB3 < 0 || minB3 > concordance){minB3 = concordance;}; if (concordance > maxB3){maxB3 = concordance;}; if (matches > maxB3l){maxB3l = matches;}; if (compared-matches > maxB3d){maxB3d = compared-matches;}}else{B3c++;}
            } else if (readstart < C2stop && readstop > C2start){
                stats = CalculateConcordance(read, C2, "C2", C2start, C2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minC2 < 0 || minC2 > concordance){minC2 = concordance;}; if (concordance > maxC2){maxC2 = concordance;}; if (matches > maxC2l){maxC2l = matches;}; if (compared-matches > maxC2d){maxC2d = compared-matches;}}else{C2c++;}
            } else if (readstart < C3stop && readstop > C3start){
                stats = CalculateConcordance(read, C3, "C3", C3start, C3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minC3 < 0 || minC3 > concordance){minC3 = concordance;}; if (concordance > maxC3){maxC3 = concordance;}; if (matches > maxC3l){maxC3l = matches;}; if (compared-matches > maxC3d){maxC3d = compared-matches;}}else{C3c++;}
            }
        }
        // Pass 2: cluster each read using the ranges from pass 1. Note this reads
        // the NumCompared/MaxNumMatches/MaxConcordance values written (as Doubles)
        // by CalculateConcordance in pass 1, not the filter-file values.
        for (int i = 0; i < AlignedReads.size(); i++){
            read = AlignedReads.get(i);
            readstart = read.getAlignmentStart();
            readstop = read.getAlignmentEnd();
            name = read.getReadName(); name2 = "";
            if (NumCompared.containsKey(name)){
                compared = Double.parseDouble((String) NumCompared.get(name).toString());
                matches = Double.parseDouble((String) MaxNumMatches.get(name).toString());
                concordance = Double.parseDouble((String) MaxConcordance.get(name).toString());
                // NOTE(review): the threshold tests `matches` but the message below
                // says "compared at %s sites" and prints `matches` — confirm whether
                // `compared` was intended here.
                if (matches > 40){
                    if (readstart < A2stop && readstop > A2start){
                        locus = "A2"; name2 = A2.getReadName();
                        a2 = (concordance - minA2)/(maxA2-minA2); if (a2 >= .5){A2a++;}else{A2b++;}; normalized = a2;
                    } else if (readstart < A3stop && readstop > A3start){
                        locus = "A3"; name2 = A3.getReadName();
                        a3 = (concordance - minA3)/(maxA3-minA3); if (a3 >= .5){A3a++;}else{A3b++;}; normalized = a3;
                    } else if (readstart < B2stop && readstop > B2start){
                        locus = "B2"; name2 = B2.getReadName();
                        b2 = (concordance - minB2)/(maxB2-minB2); if (b2 >= .5){B2a++;}else{B2b++;}; normalized = b2;
                    } else if (readstart < B3stop && readstop > B3start){
                        locus = "B3"; name2 = B3.getReadName();
                        b3 = (concordance - minB3)/(maxB3-minB3); if (b3 >= .5){B3a++;}else{B3b++;}; normalized = b3;
                    } else if (readstart < C2stop && readstop > C2start){
                        locus = "C2"; name2 = C2.getReadName();
                        c2 = (concordance - minC2)/(maxC2-minC2); if (c2 >= .5){C2a++;}else{C2b++;}; normalized = c2;
                    } else if (readstart < C3stop && readstop > C3start){
                        locus = "C3"; name2 = C3.getReadName();
                        c3 = (concordance - minC3)/(maxC3-minC3); if (c3 >= .5){C3a++;}else{C3b++;}; normalized = c3;
                    }
                    out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\t%.3f\n",locus,name,name2,matches,compared,concordance,normalized);
                }else{
                    out.printf("%s (compared at %s sites) is too short\n",name,matches);
                }
            }else{
                out.printf("%s [%s to %s] not found\n",name,readstart,readstop);
            }
        }
        // Minority-cluster fraction per exon; -1 when only one cluster is populated.
        if (A2a > 0 && A2b > 0){if (A2a > A2b){ratioA2 = (double)A2b/(A2a+A2b);}else{ratioA2 = (double)A2a/(A2a+A2b);}}else{ratioA2 = -1;}
        if (A3a > 0 && A3b > 0){if (A3a > A3b){ratioA3 = (double)A3b/(A3a+A3b);}else{ratioA3 = (double)A3a/(A3a+A3b);}}else{ratioA3 = -1;}
        if (B2a > 0 && B2b > 0){if (B2a > B2b){ratioB2 = (double)B2b/(B2a+B2b);}else{ratioB2 = (double)B2a/(B2a+B2b);}}else{ratioB2 = -1;}
        if (B3a > 0 && B3b > 0){if (B3a > B3b){ratioB3 = (double)B3b/(B3a+B3b);}else{ratioB3 = (double)B3a/(B3a+B3b);}}else{ratioB3 = -1;}
        if (C2a > 0 && C2b > 0){if (C2a > C2b){ratioC2 = (double)C2b/(C2a+C2b);}else{ratioC2 = (double)C2a/(C2a+C2b);}}else{ratioC2 = -1;}
        if (C3a > 0 && C3b > 0){if (C3a > C3b){ratioC3 = (double)C3b/(C3a+C3b);}else{ratioC3 = (double)C3a/(C3a+C3b);}}else{ratioC3 = -1;}
        out.printf("RATIO_A2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA2,A2a,A2b,A2c,maxA2-minA2,maxA2l,maxA2d);
        out.printf("RATIO_A3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA3,A3a,A3b,A3c,maxA3-minA3,maxA3l,maxA3d);
        out.printf("RATIO_B2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB2,B2a,B2b,B2c,maxB2-minB2,maxB2l,maxB2d);
        out.printf("RATIO_B3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB3,B3a,B3b,B3c,maxB3-minB3,maxB3l,maxB3d);
        out.printf("RATIO_C2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC2,C2a,C2b,C2c,maxC2-minC2,maxC2l,maxC2d);
        out.printf("RATIO_C3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC3,C3a,C3b,C3c,maxC3-minC3,maxC3l,maxC3d);
    }

    /** Immutable holder for the concordance statistics of one read-vs-representative comparison. */
    public class STATS {
        protected double concordance = 0.0;
        protected double numcompared = 0;
        protected double nummatches = 0;
        public STATS(double d, double i, double m) {
            concordance = d;
            numcompared = i;
            nummatches = m;
        }
        /** Fraction of compared polymorphic sites where the two reads agree. */
        public double getConcordance() {
            return concordance;
        }
        /** Number of polymorphic sites compared. */
        public double getNumCompared() {
            return numcompared;
        }
        /** Number of compared sites where the bases matched. */
        public double getNumMatches() {
            return nummatches;
        }
    }

    /**
     * Compares read1 against read2 (the exon's representative) at the known
     * polymorphic sites within [start, stop], using CIGAR-expanded sequences.
     * Deleted bases ('D') in read1 are skipped as possible sequencing errors.
     * Side effect: records nummatched/numcompared/concordance for read1 in
     * MaxNumMatches/NumCompared/MaxConcordance (overwriting any filter-file values).
     *
     * @param read1 the read being evaluated
     * @param read2 the representative read for this exon
     * @param locus exon label (used only in the commented-out debug print)
     * @param start exon start position (inclusive)
     * @param stop  exon stop position (inclusive)
     * @return a STATS holder; concordance is NaN when no sites were compared
     */
    private STATS CalculateConcordance(SAMRecord read1, SAMRecord read2, String locus, int start, int stop){
        int start1 = read1.getAlignmentStart(), stop1 = read1.getAlignmentEnd();
        int start2 = read2.getAlignmentStart(), stop2 = read2.getAlignmentEnd();
        int pos;
        double numcompared = 0, nummatched = 0, concordance;
        char c1, c2;
        String s1 = formatter.FormatRead(read1.getCigarString(), read1.getReadString());
        String s2 = formatter.FormatRead(read2.getCigarString(), read2.getReadString());
        //Polymorphic sites: always increment denominator, increment numerator when bases are concordant
        for (int j = 0; j < PolymorphicSites.length; j++){
            pos = PolymorphicSites[j];
            if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2 && pos >= start && pos <= stop){
                c1 = s1.charAt(pos-start1);
                c2 = s2.charAt(pos-start2);
                if (c1 != 'D'){//allow for deletions (sequencing errors)
                    numcompared++;
                    if (c1 == c2){
                        nummatched++;
                    }
                }
            }
        }
        //Non-polymorphic sites: increment denominator only when bases are discordant
        // (currently disabled; kept for experimentation)
        if (false){
            for (int j = 0; j < NonPolymorphicSites.length; j++){
                pos = NonPolymorphicSites[j];
                if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2){
                    c1 = s1.charAt(pos-start1);
                    c2 = s2.charAt(pos-start2);
                    if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
                        numcompared++;
                    }
                }
            }
        }
        //Update concordance array
        concordance=nummatched/numcompared;
        MaxNumMatches.put(read1.getReadName(), nummatched);
        NumCompared.put(read1.getReadName(), numcompared);
        MaxConcordance.put(read1.getReadName(), concordance);
        //out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\n",locus,read1.getReadName(),read2.getReadName(),nummatched,numcompared,concordance);
        return new STATS(concordance, numcompared, nummatched);
    }
}

View File

@ -52,6 +52,12 @@ public class CreatePedFileWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "DNAcode", shortName = "DNAcode", doc = "Amino acid codes", required = false)
public String dnaCodesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/DNA_CODE.txt";
@Argument(fullName = "PrintDNA", shortName = "PrintDNA", doc = "Print DNA sequences", required = false)
public boolean PrintDNA = false;
@Argument(fullName = "PrintAA", shortName = "PrintAA", doc = "Print Amino Acid sequences", required = false)
public boolean PrintAA = true;
String[] HLAnames, HLAreads, inputFileContents;
Integer[] HLAstartpos, HLAstoppos;
ArrayList<String> HLAnamesAL, HLAreadsAL;
@ -408,23 +414,26 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2
if (true) {
error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end);
error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end);
error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end);
error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end);
error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end);
error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end);
error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end);
error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end);
error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons);
error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons);
error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons);
error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons);
error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons);
error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons);
error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons);
error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons);
if (PrintDNA){
error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end);
error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end);
error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end);
error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end);
error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end);
error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end);
error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end);
error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end);
}
if (PrintAA){
error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons);
error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons);
error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons);
error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons);
error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons);
error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons);
error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons);
error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons);
}
out.printf("\n");
out.printf("%s",error);
}
@ -433,23 +442,27 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2
//Prints SNP names for each site
if (true){
PrintSNPS(HLA_A_start,HLA_A_end);
PrintSNPS(HLA_C_start,HLA_C_end);
PrintSNPS(HLA_B_start,HLA_B_end);
PrintSNPS(HLA_DRB1_start,HLA_DRB1_end);
PrintSNPS(HLA_DQA1_start,HLA_DQA1_end);
PrintSNPS(HLA_DQB1_start,HLA_DQB1_end);
PrintSNPS(HLA_DPA1_start,HLA_DPA1_end);
PrintSNPS(HLA_DPB1_start,HLA_DPB1_end);
if (PrintDNA){
PrintSNPS(HLA_A_start,HLA_A_end);
PrintSNPS(HLA_C_start,HLA_C_end);
PrintSNPS(HLA_B_start,HLA_B_end);
PrintSNPS(HLA_DRB1_start,HLA_DRB1_end);
PrintSNPS(HLA_DQA1_start,HLA_DQA1_end);
PrintSNPS(HLA_DQB1_start,HLA_DQB1_end);
PrintSNPS(HLA_DPA1_start,HLA_DPA1_end);
PrintSNPS(HLA_DPB1_start,HLA_DPB1_end);
}
PrintAminoAcidSites(A_exons,"A",true);
PrintAminoAcidSites(C_exons,"C",false);
PrintAminoAcidSites(B_exons,"B",false);
PrintAminoAcidSites(DRB1_exons,"DRB1",false);
PrintAminoAcidSites(DQA1_exons,"DQA1",true);
PrintAminoAcidSites(DQB1_exons,"DQB1",false);
PrintAminoAcidSites(DPA1_exons,"DPA1",false);
PrintAminoAcidSites(DPB1_exons,"DPB1",true);
if (PrintAA){
PrintAminoAcidSites(A_exons,"A",true);
PrintAminoAcidSites(C_exons,"C",false);
PrintAminoAcidSites(B_exons,"B",false);
PrintAminoAcidSites(DRB1_exons,"DRB1",false);
PrintAminoAcidSites(DQA1_exons,"DQA1",true);
PrintAminoAcidSites(DQB1_exons,"DQB1",false);
PrintAminoAcidSites(DPA1_exons,"DPA1",false);
PrintAminoAcidSites(DPB1_exons,"DPB1",true);
}
}
}

View File

@ -43,6 +43,9 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
public boolean findFirst = false;
@Argument(fullName = "DEBUG", shortName = "DEBUG", doc = "Debug walker", required = false)
public boolean debug = false;
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
public String debugAllele = "";
@ -50,16 +53,18 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "Caucasian";
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
SAMFileReader HLADictionaryReader = new SAMFileReader();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
@ -74,6 +79,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
int numHLAlleles = 0;
int minstartpos = 0;
int maxstoppos = 0;
int numpolymorphicsites = 0, numnonpolymorphicsites = 0, pos =0;
int HLA_A_start = 30018310;
int HLA_A_end = 30021211;
@ -89,7 +95,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
//Load HLA dictionary
out.printf("INFO Loading HLA dictionary ... ");
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLADictionaryReader.ReadFile(HLAdictionaryFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
@ -121,7 +127,8 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
siteFileReader.ReadFile(PolymorphicSitesFile);
PolymorphicSites = siteFileReader.GetPolymorphicSites();
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
numpolymorphicsites = PolymorphicSites.length;
numnonpolymorphicsites = NonPolymorphicSites.length;
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
out.printf("INFO Comparing reads to database ...\n");
@ -133,53 +140,15 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
return 0;
}
private void FindPolymorphicSites(int start, int stop){
boolean initialized, polymorphic, examined;
char c = ' ';
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
//Find polymorphic sites in dictionary
for (int pos = start; pos <= stop; pos++){
initialized = false; polymorphic = false; examined = false;
//look across all alleles at specific position to see if it is polymorphic
for (int i = 0; i < HLAreads.length; i++){
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
if (!initialized){
c = HLAreads[i].charAt(pos-HLAstartpos[i]);
initialized = true;
examined = true;
}
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
polymorphicsites.add(pos);
out.printf("POLYMORPHIC\t6\t%s\n", pos);
polymorphic = true;
break;
}
}
}
if (!polymorphic && examined){
nonpolymorphicsites.add(pos);
out.printf("CONSERVED\t6\t%s\n", pos);
}
}
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
}
private double CalculateConcordance(SAMRecord read){
int readstart = read.getAlignmentStart();
int readstop = read.getAlignmentEnd();
int numpolymorphicsites, numnonpolymorphicsites, pos;
char c1, c2;
double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0;
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
String s2;
int allelestart, allelestop;
numpolymorphicsites = PolymorphicSites.length;
numnonpolymorphicsites = NonPolymorphicSites.length;
if (ONLYFREQUENT){
minFreq = 0.0001;
}
@ -200,7 +169,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != 'D'){//allow for deletions (sequencing errors)
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (c1 == c2){
nummatched[i]++;
@ -214,16 +183,17 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
}
//Non-polymorphic sites: increment denominator only when bases are discordant
for (int j = 0; j < numnonpolymorphicsites; j++){
pos = NonPolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
if (numcompared[i] > 0){
for (int j = 0; j < numnonpolymorphicsites; j++){
pos = NonPolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
}
}
}
}
@@ -259,26 +229,32 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
//Calculate concordance for this read and all overlapping reads
double maxConcordance = CalculateConcordance(read);
if (read.getMappingQuality() > 0){
double maxConcordance = CalculateConcordance(read);
String readname = read.getReadName(), allelename = ""; double freq;
//For input bam files that contain HLA alleles, find and print allele frequency
freq = GetAlleleFrequency(readname);
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
if (maxConcordance > 0){
String readname = read.getReadName(), allelename = ""; double freq;
//For input bam files that contain HLA alleles, find and print allele frequency
//freq = GetAlleleFrequency(readname);
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
//Find the maximum frequency of the alleles most concordant with the read
double maxFreq = FindMaxAlleleFrequency(maxConcordance);
//Print concordance statistics between this read and the most similar HLA allele(s)
for (int i = 0; i < HLAreads.length; i++){
if (concordance[i] == maxConcordance && maxConcordance > 0){
freq = GetAlleleFrequency(HLAnames[i]);
if (freq == maxFreq){
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
//Find the maximum frequency of the alleles most concordant with the read
//double maxFreq = FindMaxAlleleFrequency(maxConcordance);
//Print concordance statistics between this read and the most similar HLA allele(s)
for (int i = 0; i < HLAreads.length; i++){
if (concordance[i] == maxConcordance){
freq = GetAlleleFrequency(HLAnames[i]);
//if (freq == maxFreq){
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
//}
break;
}
}
out.print("\n");
}
}
out.print("\n");
return 1;
}

View File

@@ -0,0 +1,152 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;

import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList;
import java.util.Hashtable;

/**
 * Finds polymorphic sites in the HLA dictionary. Usage: java -jar GenomeAnalysisTK.jar -T FindPolymorphicSites -I HLA_DICTIONARY.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT
 *
 * Scans every position covered by the HLA dictionary and classifies it as
 * POLYMORPHIC (at least two dictionary alleles disagree there) or CONSERVED
 * (all overlapping alleles agree). Results are written to standard out, one
 * line per position; progress lines are prefixed with "INFO" so the pipeline
 * in the usage string can filter them out with grep.
 *
 * NOTE(review): all real work happens in reduceInit() the first time it runs
 * (guarded by DatabaseLoaded); map() is a no-op. This walker only needs the
 * traversal machinery to start, not the reads themselves.
 *
 * @author shermanjia
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
    public String debugRead = "";
    @Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
    public boolean findFirst = false;
    @Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
    public String debugAllele = "";
    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "Caucasian";
    @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
    public boolean ONLYFREQUENT = false;

    // Hard-coded input locations: the HLA allele dictionary (SAM) and the
    // per-ethnicity allele-frequency tables. TODO(review): these absolute
    // paths should become @Argument parameters.
    //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
    String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
    // Project-local SAM reader (HLAcaller package), not the Picard class of the same name.
    SAMFileReader HLADictionaryReader = new SAMFileReader();
    //String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
    String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
    // Chosen from the two files above in reduceInit() based on -ethnicity.
    String AlleleFrequencyFile;
    String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
    // NOTE(review): declared but never read in this walker.
    String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";

    // One-shot latch so the dictionary/frequency loading in reduceInit() runs only once.
    boolean DatabaseLoaded = false;
    boolean DEBUG = false;

    // Parallel arrays describing the dictionary: allele i has name HLAnames[i],
    // sequence HLAreads[i], and reference span [HLAstartpos[i], HLAstoppos[i]].
    String[] HLAnames, HLAreads;
    Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
    // NOTE(review): SingleAlleleFrequencies and the per-allele counters below
    // are allocated but never updated here (map() is empty); they appear to be
    // carried over from FindClosestAlleleWalker.
    double[] SingleAlleleFrequencies;
    double[] nummatched, concordance, numcompared;

    int numHLAlleles = 0;
    // Overall reference span covered by the dictionary; the position scan runs over this range.
    int minstartpos = 0;
    int maxstoppos = 0;

    // b36 coordinates of HLA-A on chromosome 6 (unused in this walker).
    int HLA_A_start = 30018310;
    int HLA_A_end = 30021211;

    // allele name -> frequency, keyed per the selected ethnicity file.
    Hashtable AlleleFrequencies = new Hashtable();
    int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
    CigarParser formatter = new CigarParser();

    /**
     * Lazily loads the HLA dictionary and allele frequencies on first call,
     * then scans the covered range and prints POLYMORPHIC/CONSERVED lines.
     * Subsequent calls are no-ops thanks to the DatabaseLoaded latch.
     */
    public Integer reduceInit() {
        if (!DatabaseLoaded){
            DatabaseLoaded = true;

            //Load HLA dictionary
            out.printf("INFO  Loading HLA dictionary ... ");

            HLADictionaryReader.ReadFile(HLAdatabaseFile);
            HLAreads = HLADictionaryReader.GetReads();
            HLAnames = HLADictionaryReader.GetReadNames();
            HLAstartpos = HLADictionaryReader.GetStartPositions();
            HLAstoppos = HLADictionaryReader.GetStopPositions();
            minstartpos = HLADictionaryReader.GetMinStartPos();
            maxstoppos = HLADictionaryReader.GetMaxStopPos();

            out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);

            nummatched = new double[HLAreads.length];
            concordance = new double[HLAreads.length];
            numcompared = new double[HLAreads.length];

            //Read allele frequencies
            // Only "Black" selects the Black-USA table; any other value
            // (including the default "Caucasian") falls back to the Caucasian table.
            if (ethnicity.equals("Black")){
                AlleleFrequencyFile = BlackAlleleFrequencyFile;
            }else{
                AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
            }
            out.printf("INFO  Reading HLA allele frequencies ... ");
            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());

            // The actual site classification; prints one line per examined position.
            FindPolymorphicSites(minstartpos,maxstoppos);
            out.printf("INFO  %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
            out.printf("INFO  Comparing reads to database ...\n");
            if (DEBUG){
                //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
            }
        }
        return 0;
    }

    /**
     * Classifies every reference position in [start, stop] that is covered by
     * at least one dictionary allele. A position is polymorphic if any
     * overlapping allele's base differs from the first overlapping allele's
     * base (first mismatch wins; remaining alleles are not inspected).
     * Positions covered by no allele are neither printed nor recorded.
     * Results go to PolymorphicSites / NonPolymorphicSites and to stdout
     * ("POLYMORPHIC\t6\tpos" / "CONSERVED\t6\tpos" — "6" is the chromosome;
     * the classical HLA loci are on chr6).
     */
    private void FindPolymorphicSites(int start, int stop){
        boolean initialized, polymorphic, examined;
        char c = ' ';
        ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
        ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();

        //Find polymorphic sites in dictionary
        for (int pos = start; pos <= stop; pos++){
            initialized = false; polymorphic = false; examined = false;
            //look across all alleles at specific position to see if it is polymorphic
            for (int i = 0; i < HLAreads.length; i++){
                if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
                    if (!initialized){
                        // Baseline base taken from the first allele overlapping this position.
                        // NOTE(review): examined mirrors initialized exactly; one flag would do.
                        c = HLAreads[i].charAt(pos-HLAstartpos[i]);
                        initialized = true;
                        examined = true;
                    }
                    if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
                        polymorphicsites.add(pos);
                        out.printf("POLYMORPHIC\t6\t%s\n", pos);
                        polymorphic = true;
                        break;
                    }
                }
            }
            if (!polymorphic && examined){
                nonpolymorphicsites.add(pos);
                out.printf("CONSERVED\t6\t%s\n", pos);
            }
        }
        PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
        NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
    }

    /**
     * Intentionally empty: the dictionary scan in reduceInit() does all the
     * work, so each read just contributes 1 to the traversal count.
     */
    public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
        //Calculate concordance for this read and all overlapping reads
        return 1;
    }

    /** Sums the per-read counts from map(). */
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }
}

View File

@@ -16,8 +16,9 @@ import java.util.Hashtable;
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam";
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.closest";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
// String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS1.closest";
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS2.closest";
boolean DatabaseLoaded = false;
boolean DEBUG = false;
@@ -40,14 +41,24 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
int HLA_B_end = 31432914;
int HLA_C_start = 31344925;
int HLA_C_end = 31347827;
int HLA_DQA1_start = 32713161;
int HLA_DQA1_end = 32719407;
int HLA_DQB1_start = 32735635;
int HLA_DQB1_end = 32742444;
int HLA_DPA1_start = 33140772;
int HLA_DPA1_end = 33149356;
int HLA_DPB1_start = 33151738;
int HLA_DPB1_end = 33162954;
int HLA_DRB1_start = 32654525;
int HLA_DRB1_end = 32665540;
ArrayList<String> PolymorphicSites = new ArrayList<String>();
Hashtable ClosestAllele = new Hashtable();
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1, iDRBstart = -1, iDRBstop = -1, iDQAstart = -1, iDQAstop = -1, iDQBstart = -1, iDQBstop = -1, iDPAstart = -1, iDPAstop = -1, iDPBstart = -1, iDPBstop = -1;
CigarParser formatter = new CigarParser();
public Integer reduceInit() {
if (!DatabaseLoaded){
try{
@@ -77,6 +88,21 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
}else if (s[0].indexOf("HLA_C") > -1){
if (iCstart < 0){iCstart=i;}
iCstop = i; i++;
}else if (s[0].indexOf("HLA_DRB1") > -1){
if (iDRBstart < 0){iDRBstart=i;}
iDRBstop = i; i++;
}else if (s[0].indexOf("HLA_DQA1") > -1){
if (iDQAstart < 0){iDQAstart=i;}
iDQAstop = i; i++;
}else if (s[0].indexOf("HLA_DQB1") > -1){
if (iDQBstart < 0){iDQBstart=i;}
iDQBstop = i; i++;
}else if (s[0].indexOf("HLA_DPA1") > -1){
if (iDPAstart < 0){iDPAstart=i;}
iDPAstop = i; i++;
}else if (s[0].indexOf("HLA_DPB1") > -1){
if (iDPBstart < 0){iDPBstart=i;}
iDPBstop = i; i++;
}
}
}
@@ -146,10 +172,12 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
int numM = 0, numI = 0, numD = 0;
name = read.getReadName();
String matchedAllele = (String) ClosestAllele.get(name);
//out.printf("%s\t%s\n",name,matchedAllele);
int index = HLAnames.indexOf(matchedAllele);
String matchedRead = HLAreads.get(index);
if (name.indexOf("HLA_A") > -1){
@@ -161,8 +189,24 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
} else if (name.indexOf("HLA_C") > -1){
startimputation = HLA_C_start;
stopimputation = HLA_C_end;
} else if (name.indexOf("HLA_DRB1") > -1){
startimputation = HLA_DRB1_start;
stopimputation = HLA_DRB1_end;
} else if (name.indexOf("HLA_DQA1") > -1){
startimputation = HLA_DQA1_start;
stopimputation = HLA_DQA1_end;
} else if (name.indexOf("HLA_DQB1") > -1){
startimputation = HLA_DQB1_start;
stopimputation = HLA_DQB1_end;
} else if (name.indexOf("HLA_DPA1") > -1){
startimputation = HLA_DPA1_start;
stopimputation = HLA_DPA1_end;
} else if (name.indexOf("HLA_DPB1") > -1){
startimputation = HLA_DPB1_start;
stopimputation = HLA_DPB1_end;
}
//out.printf("DEBUG %s\t%s\t%s\t%s\t%s\n",name,matchedAllele,index,startimputation,stopimputation);
for (int i = startimputation; i <= stopimputation; i++){
//if position is within read
if (i >= readstart && i <= readstop){

View File

@@ -7,12 +7,16 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
import java.util.Hashtable;
/**
*
* @author shermanjia
*/
public class SimilarityFileReader {
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
Hashtable Concordance = new Hashtable();
Hashtable NumMatches = new Hashtable();
Hashtable NumMismatches = new Hashtable();
public ArrayList<String> GetReadsToDiscard(){
return ReadsToDiscard;
@@ -22,6 +26,18 @@ public class SimilarityFileReader {
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
}
public Hashtable GetConcordance(){
return Concordance;
}
public Hashtable GetNumMatches(){
return NumMatches;
}
public Hashtable GetNumMismatches(){
return NumMismatches;
}
public void ReadFile(String filename, int minAllowedMismatches){
try{
FileInputStream fstream = new FileInputStream(filename);
@@ -35,6 +51,10 @@ public class SimilarityFileReader {
if (s.length >= 6){
Double matchFraction = Double.valueOf(s[4]);
int numMismatches = Integer.valueOf(s[6]);
Concordance.put(s[0],matchFraction);
NumMatches.put(s[0], s[5]);
NumMismatches.put(s[0], numMismatches);
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
ReadsToDiscard.add(s[0]);
}