HLA caller updated to examine class II loci, updated pointers to dictionary, allele frequencies.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3290 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
97fdd92e7b
commit
94b51de401
|
|
@ -56,10 +56,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||||
public String ethnicity = "CaucasianUSA";
|
public String ethnicity = "CaucasianUSA";
|
||||||
|
|
||||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
|
||||||
Hashtable AlleleFrequencies,UniqueAlleles;
|
Hashtable AlleleFrequencies,UniqueAlleles;
|
||||||
|
|
||||||
CigarParser formatter = new CigarParser();
|
CigarParser formatter = new CigarParser();
|
||||||
|
|
@ -95,6 +95,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
|
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
|
||||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||||
|
|
||||||
|
//out.printf("INFO Common alleles:\n");
|
||||||
|
for (int i = 1; i < UniqueAlleles.size(); i++){
|
||||||
|
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
|
||||||
|
}
|
||||||
out.printf("INFO Reading HLA dictionary ...");
|
out.printf("INFO Reading HLA dictionary ...");
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -188,12 +192,18 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
|
|
||||||
for (int i = 0; i < numreads; i++){
|
for (int i = 0; i < numreads; i++){
|
||||||
name1 = HLAnames[i].substring(4);
|
name1 = HLAnames[i].substring(4);
|
||||||
if (UniqueAlleles.containsKey(name1)){
|
String [] n1 = name1.split("\\*");
|
||||||
|
// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3));
|
||||||
|
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
|
||||||
|
//out.printf("1: %s\n",name1);
|
||||||
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||||
//if (frq1 > minfrq){
|
//if (frq1 > minfrq){
|
||||||
for (int j = i; j < numreads; j++){
|
for (int j = i; j < numreads; j++){
|
||||||
name2 = HLAnames[j].substring(4);
|
name2 = HLAnames[j].substring(4);
|
||||||
if (UniqueAlleles.containsKey(name2)){
|
String [] n2 = name2.split("\\*");
|
||||||
|
// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3));
|
||||||
|
if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
|
||||||
|
|
||||||
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
||||||
// if (frq2 > minfrq){
|
// if (frq2 > minfrq){
|
||||||
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
|
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
|
||||||
|
|
@ -211,8 +221,8 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
|
||||||
//}else{
|
//}else{
|
||||||
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
|
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
|
||||||
//}
|
//}
|
||||||
}else{
|
//}else{
|
||||||
if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
|
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -55,8 +55,8 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
||||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||||
public int MINALLOWEDMISMATCHES = 7;
|
public int MINALLOWEDMISMATCHES = 7;
|
||||||
|
|
||||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||||
|
|
|
||||||
|
|
@ -67,12 +67,12 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
||||||
|
|
||||||
GATKArgumentCollection args = this.getToolkit().getArguments();
|
GATKArgumentCollection args = this.getToolkit().getArguments();
|
||||||
|
|
||||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||||
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
|
||||||
|
|
||||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||||
boolean HLAdataLoaded = false;
|
boolean HLAdataLoaded = false;
|
||||||
String[] HLAnames, HLAreads;
|
String[] HLAnames, HLAreads;
|
||||||
|
|
@ -216,7 +216,8 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
||||||
out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
|
out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
|
||||||
for (int i = 0; i < HLAnames.length; i++){
|
for (int i = 0; i < HLAnames.length; i++){
|
||||||
name1 = HLAnames[i].substring(4);
|
name1 = HLAnames[i].substring(4);
|
||||||
if (UniqueAlleles.containsKey(name1)){
|
String [] n1 = name1.split("\\*");
|
||||||
|
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
|
||||||
if (AlleleFrequencies.containsKey(name1)){
|
if (AlleleFrequencies.containsKey(name1)){
|
||||||
frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||||
}else{
|
}else{
|
||||||
|
|
@ -225,19 +226,19 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
||||||
//if (frq1 > minfrq){
|
//if (frq1 > minfrq){
|
||||||
for (int j = i; j < HLAnames.length; j++){
|
for (int j = i; j < HLAnames.length; j++){
|
||||||
name2 = HLAnames[j].substring(4);
|
name2 = HLAnames[j].substring(4);
|
||||||
if (name1.substring(0,1).equals(name2.substring(0,1))){
|
String [] n2 = name2.split("\\*");
|
||||||
if (UniqueAlleles.containsKey(name2)){
|
if (n1[0].equals(n2[0]) && UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
|
||||||
if (AlleleFrequencies.containsKey(name2)){
|
if (AlleleFrequencies.containsKey(name2)){
|
||||||
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
||||||
}else{
|
}else{
|
||||||
frq2 = .0001;
|
frq2 = .0001;
|
||||||
}
|
|
||||||
// if (frq2 > minfrq){
|
|
||||||
likelihood = CalculatePhaseLikelihood(i,j,false);
|
|
||||||
numCombinations++;
|
|
||||||
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
// if (frq2 > minfrq){
|
||||||
|
likelihood = CalculatePhaseLikelihood(i,j,false);
|
||||||
|
numCombinations++;
|
||||||
|
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
|
||||||
|
// }
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//}
|
//}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,322 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Hashtable;
|
||||||
|
/**
|
||||||
|
* Compares reads to longest read at each exon. Usage: java -jar GenomeAnalysisTK.jar -T ClusterReads -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta [-filter INPUT.filter] | grep -v INFO | sort -k1 > OUTPUT
|
||||||
|
* @author shermanjia
|
||||||
|
*/
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
|
public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||||
|
public String filterFile = "";
|
||||||
|
|
||||||
|
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
||||||
|
public int MINALLOWEDMISMATCHES = 5;
|
||||||
|
|
||||||
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||||
|
|
||||||
|
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||||
|
|
||||||
|
boolean DatabaseLoaded = false;
|
||||||
|
boolean DEBUG = false;
|
||||||
|
|
||||||
|
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
|
||||||
|
double[] SingleAlleleFrequencies;
|
||||||
|
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
||||||
|
ArrayList<SAMRecord> AlignedReads = new ArrayList<SAMRecord>();
|
||||||
|
|
||||||
|
Hashtable MaxNumMatches, MaxConcordance, NumCompared = new Hashtable();
|
||||||
|
|
||||||
|
double[] nummatched, concordance, numcompared;
|
||||||
|
|
||||||
|
SAMRecord A2, A3, B2, B3, C2, C3;
|
||||||
|
int MaxMatchesA2 = 0, MaxMatchesA3 = 0, MaxMatchesB2 = 0, MaxMatchesB3 = 0, MaxMatchesC2 = 0, MaxMatchesC3 = 0;
|
||||||
|
int A2start = 30018513, A2stop = 30018781;
|
||||||
|
int A3start = 30019024, A3stop = 30019300;
|
||||||
|
|
||||||
|
int C2start = 31347355, C2stop = 31347623;
|
||||||
|
int C3start = 31346829, C3stop = 31347104;
|
||||||
|
|
||||||
|
int B2start = 31432444, B2stop = 31432714;
|
||||||
|
int B3start = 31431923, B3stop = 31432198;
|
||||||
|
|
||||||
|
int minstartpos = 0;
|
||||||
|
int maxstoppos = 0;
|
||||||
|
|
||||||
|
|
||||||
|
Hashtable AlleleFrequencies = new Hashtable();
|
||||||
|
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||||
|
CigarParser formatter = new CigarParser();
|
||||||
|
|
||||||
|
public Integer reduceInit() {
|
||||||
|
if (!DatabaseLoaded){
|
||||||
|
DatabaseLoaded = true;
|
||||||
|
|
||||||
|
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
|
||||||
|
siteFileReader.ReadFile(PolymorphicSitesFile);
|
||||||
|
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
||||||
|
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
|
||||||
|
|
||||||
|
|
||||||
|
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||||
|
|
||||||
|
if (!filterFile.equals("")){
|
||||||
|
out.printf("INFO Reading properties file ... ");
|
||||||
|
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||||
|
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
||||||
|
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||||
|
MaxNumMatches = similarityReader.GetNumMatches();
|
||||||
|
MaxConcordance = similarityReader.GetConcordance();
|
||||||
|
|
||||||
|
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
|
||||||
|
for (int i = 0; i < ReadsToDiscard.size(); i++){
|
||||||
|
out.printf("MISALIGNED %s\n", ReadsToDiscard.get(i).toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out.printf("INFO Comparing reads ...\n");
|
||||||
|
|
||||||
|
if (DEBUG){
|
||||||
|
//out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||||
|
//Calculate concordance for this read and all overlapping reads
|
||||||
|
if (!ReadsToDiscard.contains(read.getReadName())){
|
||||||
|
AlignedReads.add(read);
|
||||||
|
|
||||||
|
int readstart = read.getAlignmentStart();
|
||||||
|
int readstop = read.getAlignmentEnd();
|
||||||
|
int length = readstop - readstart + 1;
|
||||||
|
if (MaxNumMatches.containsKey(read.getReadName())){
|
||||||
|
int maxMatches = Integer.parseInt(MaxNumMatches.get(read.getReadName()).toString());
|
||||||
|
double concordance = Double.parseDouble((String) MaxConcordance.get(read.getReadName()).toString());
|
||||||
|
|
||||||
|
if (readstart < A2stop && readstop > A2start){
|
||||||
|
if (maxMatches > MaxMatchesA2 && concordance > 0.95){
|
||||||
|
MaxMatchesA2 = maxMatches;
|
||||||
|
A2 = read;
|
||||||
|
}
|
||||||
|
} else if (readstart < A3stop && readstop > A3start){
|
||||||
|
if (maxMatches > MaxMatchesA3){
|
||||||
|
MaxMatchesA3 = maxMatches;
|
||||||
|
A3 = read;
|
||||||
|
}
|
||||||
|
} else if (readstart < B2stop && readstop > B2start){
|
||||||
|
if (maxMatches > MaxMatchesB2){
|
||||||
|
MaxMatchesB2 = maxMatches;
|
||||||
|
B2 = read;
|
||||||
|
}
|
||||||
|
} else if (readstart < B3stop && readstop > B3start){
|
||||||
|
if (maxMatches > MaxMatchesB3){
|
||||||
|
MaxMatchesB3 = maxMatches;
|
||||||
|
B3 = read;
|
||||||
|
}
|
||||||
|
} else if (readstart < C2stop && readstop > C2start){
|
||||||
|
if (maxMatches > MaxMatchesC2){
|
||||||
|
MaxMatchesC2 = maxMatches;
|
||||||
|
C2 = read;
|
||||||
|
}
|
||||||
|
} else if (readstart < C3stop && readstop > C3start){
|
||||||
|
if (maxMatches > MaxMatchesC3){
|
||||||
|
MaxMatchesC3 = maxMatches;
|
||||||
|
C3 = read;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
out.printf("Data for %s not found\n",read.getReadName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
|
||||||
|
return value + sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(Integer numreads) {
|
||||||
|
SAMRecord read; String name, name2; String locus = "";
|
||||||
|
int A2a = 0, A2b = 0, A2c = 0;
|
||||||
|
int A3a = 0, A3b = 0, A3c = 0;
|
||||||
|
int B2a = 0, B2b = 0, B2c = 0;
|
||||||
|
int B3a = 0, B3b = 0, B3c = 0;
|
||||||
|
int C2a = 0, C2b = 0, C2c = 0;
|
||||||
|
int C3a = 0, C3b = 0, C3c = 0;
|
||||||
|
double minA2 = -1, minA3 = -1, minB2 = -1, minB3 = -1, minC2 = -1, minC3 = -1;
|
||||||
|
double maxA2 = 0, maxA3 = 0, maxB2 = 0, maxB3 = 0, maxC2 = 0, maxC3 = 0;
|
||||||
|
double ratioA2 = 0, ratioA3 = 0, ratioB2 = 0, ratioB3 = 0, ratioC2 = 0, ratioC3 = 0;
|
||||||
|
double maxA2l = 0, maxA3l = 0, maxB2l = 0, maxB3l = 0, maxC2l = 0, maxC3l = 0;
|
||||||
|
double maxA2d = 0, maxA3d = 0, maxB2d = 0, maxB3d = 0, maxC2d = 0, maxC3d = 0;
|
||||||
|
double a2, a3, b2, b3, c2, c3, normalized = 0;
|
||||||
|
int readstart, readstop;
|
||||||
|
double matches, compared, concordance;
|
||||||
|
STATS stats;
|
||||||
|
|
||||||
|
for (int i = 0; i < AlignedReads.size(); i++){
|
||||||
|
read = AlignedReads.get(i);
|
||||||
|
readstart = read.getAlignmentStart();
|
||||||
|
readstop = read.getAlignmentEnd();
|
||||||
|
if (readstart < A2stop && readstop > A2start){
|
||||||
|
stats = CalculateConcordance(read, A2, "A2", A2start, A2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||||
|
if (stats.getNumCompared() > 40){if (minA2 < 0 || minA2 > concordance){minA2 = concordance;}; if (concordance > maxA2){maxA2 = concordance;}; if (matches > maxA2l){maxA2l = matches;}; if (compared-matches > maxA2d){maxA2d = compared-matches;}}else{A2c++;}
|
||||||
|
} else if (readstart < A3stop && readstop > A3start){
|
||||||
|
stats = CalculateConcordance(read, A3, "A3", A3start, A3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||||
|
if (stats.getNumCompared() > 40){if (minA3 < 0 || minA3 > concordance){minA3 = concordance;}; if (concordance > maxA3){maxA3 = concordance;}; if (matches > maxA3l){maxA3l = matches;}; if (compared-matches > maxA3d){maxA3d = compared-matches;}}else{A3c++;}
|
||||||
|
} else if (readstart < B2stop && readstop > B2start){
|
||||||
|
stats = CalculateConcordance(read, B2, "B2", B2start, B2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||||
|
if (stats.getNumCompared() > 40){if (minB2 < 0 || minB2 > concordance){minB2 = concordance;}; if (concordance > maxB2){maxB2 = concordance;}; if (matches > maxB2l){maxB2l = matches;}; if (compared-matches > maxB2d){maxB2d = compared-matches;}}else{B2c++;}
|
||||||
|
} else if (readstart < B3stop && readstop > B3start){
|
||||||
|
stats = CalculateConcordance(read, B3, "B3", B3start, B3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||||
|
if (stats.getNumCompared() > 40){if (minB3 < 0 || minB3 > concordance){minB3 = concordance;}; if (concordance > maxB3){maxB3 = concordance;}; if (matches > maxB3l){maxB3l = matches;}; if (compared-matches > maxB3d){maxB3d = compared-matches;}}else{B3c++;}
|
||||||
|
} else if (readstart < C2stop && readstop > C2start){
|
||||||
|
stats = CalculateConcordance(read, C2, "C2", C2start, C2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||||
|
if (stats.getNumCompared() > 40){if (minC2 < 0 || minC2 > concordance){minC2 = concordance;}; if (concordance > maxC2){maxC2 = concordance;}; if (matches > maxC2l){maxC2l = matches;}; if (compared-matches > maxC2d){maxC2d = compared-matches;}}else{C2c++;}
|
||||||
|
} else if (readstart < C3stop && readstop > C3start){
|
||||||
|
stats = CalculateConcordance(read, C3, "C3", C3start, C3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
|
||||||
|
if (stats.getNumCompared() > 40){if (minC3 < 0 || minC3 > concordance){minC3 = concordance;}; if (concordance > maxC3){maxC3 = concordance;}; if (matches > maxC3l){maxC3l = matches;}; if (compared-matches > maxC3d){maxC3d = compared-matches;}}else{C3c++;}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for (int i = 0; i < AlignedReads.size(); i++){
|
||||||
|
read = AlignedReads.get(i);
|
||||||
|
readstart = read.getAlignmentStart();
|
||||||
|
readstop = read.getAlignmentEnd();
|
||||||
|
name = read.getReadName(); name2 = "";
|
||||||
|
if (NumCompared.containsKey(name)){
|
||||||
|
compared = Double.parseDouble((String) NumCompared.get(name).toString());
|
||||||
|
matches = Double.parseDouble((String) MaxNumMatches.get(name).toString());
|
||||||
|
concordance = Double.parseDouble((String) MaxConcordance.get(name).toString());
|
||||||
|
if (matches > 40){
|
||||||
|
if (readstart < A2stop && readstop > A2start){
|
||||||
|
locus = "A2"; name2 = A2.getReadName();
|
||||||
|
a2 = (concordance - minA2)/(maxA2-minA2); if (a2 >= .5){A2a++;}else{A2b++;}; normalized = a2;
|
||||||
|
} else if (readstart < A3stop && readstop > A3start){
|
||||||
|
locus = "A3"; name2 = A3.getReadName();
|
||||||
|
a3 = (concordance - minA3)/(maxA3-minA3); if (a3 >= .5){A3a++;}else{A3b++;}; normalized = a3;
|
||||||
|
} else if (readstart < B2stop && readstop > B2start){
|
||||||
|
locus = "B2"; name2 = B2.getReadName();
|
||||||
|
b2 = (concordance - minB2)/(maxB2-minB2); if (b2 >= .5){B2a++;}else{B2b++;}; normalized = b2;
|
||||||
|
} else if (readstart < B3stop && readstop > B3start){
|
||||||
|
locus = "B3"; name2 = B3.getReadName();
|
||||||
|
b3 = (concordance - minB3)/(maxB3-minB3); if (b3 >= .5){B3a++;}else{B3b++;}; normalized = b3;
|
||||||
|
} else if (readstart < C2stop && readstop > C2start){
|
||||||
|
locus = "C2"; name2 = C2.getReadName();
|
||||||
|
c2 = (concordance - minC2)/(maxC2-minC2); if (c2 >= .5){C2a++;}else{C2b++;}; normalized = c2;
|
||||||
|
} else if (readstart < C3stop && readstop > C3start){
|
||||||
|
locus = "C3"; name2 = C3.getReadName();
|
||||||
|
c3 = (concordance - minC3)/(maxC3-minC3); if (c3 >= .5){C3a++;}else{C3b++;}; normalized = c3;
|
||||||
|
}
|
||||||
|
out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\t%.3f\n",locus,name,name2,matches,compared,concordance,normalized);
|
||||||
|
}else{
|
||||||
|
out.printf("%s (compared at %s sites) is too short\n",name,matches);
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
out.printf("%s [%s to %s] not found\n",name,readstart,readstop);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (A2a > 0 && A2b > 0){if (A2a > A2b){ratioA2 = (double)A2b/(A2a+A2b);}else{ratioA2 = (double)A2a/(A2a+A2b);}}else{ratioA2 = -1;}
|
||||||
|
if (A3a > 0 && A3b > 0){if (A3a > A3b){ratioA3 = (double)A3b/(A3a+A3b);}else{ratioA3 = (double)A3a/(A3a+A3b);}}else{ratioA3 = -1;}
|
||||||
|
if (B2a > 0 && B2b > 0){if (B2a > B2b){ratioB2 = (double)B2b/(B2a+B2b);}else{ratioB2 = (double)B2a/(B2a+B2b);}}else{ratioB2 = -1;}
|
||||||
|
if (B3a > 0 && B3b > 0){if (B3a > B3b){ratioB3 = (double)B3b/(B3a+B3b);}else{ratioB3 = (double)B3a/(B3a+B3b);}}else{ratioB3 = -1;}
|
||||||
|
if (C2a > 0 && C2b > 0){if (C2a > C2b){ratioC2 = (double)C2b/(C2a+C2b);}else{ratioC2 = (double)C2a/(C2a+C2b);}}else{ratioC2 = -1;}
|
||||||
|
if (C3a > 0 && C3b > 0){if (C3a > C3b){ratioC3 = (double)C3b/(C3a+C3b);}else{ratioC3 = (double)C3a/(C3a+C3b);}}else{ratioC3 = -1;}
|
||||||
|
|
||||||
|
out.printf("RATIO_A2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA2,A2a,A2b,A2c,maxA2-minA2,maxA2l,maxA2d);
|
||||||
|
out.printf("RATIO_A3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA3,A3a,A3b,A3c,maxA3-minA3,maxA3l,maxA3d);
|
||||||
|
out.printf("RATIO_B2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB2,B2a,B2b,B2c,maxB2-minB2,maxB2l,maxB2d);
|
||||||
|
out.printf("RATIO_B3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB3,B3a,B3b,B3c,maxB3-minB3,maxB3l,maxB3d);
|
||||||
|
out.printf("RATIO_C2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC2,C2a,C2b,C2c,maxC2-minC2,maxC2l,maxC2d);
|
||||||
|
out.printf("RATIO_C3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC3,C3a,C3b,C3c,maxC3-minC3,maxC3l,maxC3d);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Result holder for one read-to-read comparison: the concordance, the number of
 * sites compared and the number of matching sites.
 */
public class STATS {
    protected double concordance;
    protected double numcompared;
    protected double nummatches;

    /**
     * @param d concordance
     * @param i number of sites compared
     * @param m number of matching sites
     */
    public STATS(double d, double i, double m) {
        this.concordance = d;
        this.numcompared = i;
        this.nummatches = m;
    }

    /** @return the concordance given at construction */
    public double getConcordance() {
        return this.concordance;
    }

    /** @return the number of compared sites given at construction */
    public double getNumCompared() {
        return this.numcompared;
    }

    /** @return the number of matching sites given at construction */
    public double getNumMatches() {
        return this.nummatches;
    }
}
|
||||||
|
|
||||||
|
private STATS CalculateConcordance(SAMRecord read1, SAMRecord read2, String locus, int start, int stop){
|
||||||
|
int start1 = read1.getAlignmentStart(), stop1 = read1.getAlignmentEnd();
|
||||||
|
int start2 = read2.getAlignmentStart(), stop2 = read2.getAlignmentEnd();
|
||||||
|
|
||||||
|
int pos;
|
||||||
|
double numcompared = 0, nummatched = 0, concordance;
|
||||||
|
char c1, c2;
|
||||||
|
String s1 = formatter.FormatRead(read1.getCigarString(), read1.getReadString());
|
||||||
|
String s2 = formatter.FormatRead(read2.getCigarString(), read2.getReadString());
|
||||||
|
|
||||||
|
|
||||||
|
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
|
||||||
|
for (int j = 0; j < PolymorphicSites.length; j++){
|
||||||
|
pos = PolymorphicSites[j];
|
||||||
|
if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2 && pos >= start && pos <= stop){
|
||||||
|
c1 = s1.charAt(pos-start1);
|
||||||
|
c2 = s2.charAt(pos-start2);
|
||||||
|
if (c1 != 'D'){//allow for deletions (sequencing errors)
|
||||||
|
numcompared++;
|
||||||
|
if (c1 == c2){
|
||||||
|
nummatched++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Non-polymorphic sites: increment denominator only when bases are discordant
|
||||||
|
if (false){
|
||||||
|
for (int j = 0; j < NonPolymorphicSites.length; j++){
|
||||||
|
pos = NonPolymorphicSites[j];
|
||||||
|
if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2){
|
||||||
|
c1 = s1.charAt(pos-start1);
|
||||||
|
c2 = s2.charAt(pos-start2);
|
||||||
|
if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
|
||||||
|
numcompared++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Update concordance array
|
||||||
|
concordance=nummatched/numcompared;
|
||||||
|
|
||||||
|
MaxNumMatches.put(read1.getReadName(), nummatched);
|
||||||
|
NumCompared.put(read1.getReadName(), numcompared);
|
||||||
|
MaxConcordance.put(read1.getReadName(), concordance);
|
||||||
|
//out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\n",locus,read1.getReadName(),read2.getReadName(),nummatched,numcompared,concordance);
|
||||||
|
|
||||||
|
return new STATS(concordance, numcompared, nummatched);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -52,6 +52,12 @@ public class CreatePedFileWalker extends ReadWalker<Integer, Integer> {
|
||||||
@Argument(fullName = "DNAcode", shortName = "DNAcode", doc = "Amino acid codes", required = false)
|
@Argument(fullName = "DNAcode", shortName = "DNAcode", doc = "Amino acid codes", required = false)
|
||||||
public String dnaCodesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/DNA_CODE.txt";
|
public String dnaCodesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/DNA_CODE.txt";
|
||||||
|
|
||||||
|
@Argument(fullName = "PrintDNA", shortName = "PrintDNA", doc = "Print DNA sequences", required = false)
|
||||||
|
public boolean PrintDNA = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "PrintAA", shortName = "PrintAA", doc = "Print Amino Acid sequences", required = false)
|
||||||
|
public boolean PrintAA = true;
|
||||||
|
|
||||||
String[] HLAnames, HLAreads, inputFileContents;
|
String[] HLAnames, HLAreads, inputFileContents;
|
||||||
Integer[] HLAstartpos, HLAstoppos;
|
Integer[] HLAstartpos, HLAstoppos;
|
||||||
ArrayList<String> HLAnamesAL, HLAreadsAL;
|
ArrayList<String> HLAnamesAL, HLAreadsAL;
|
||||||
|
|
@ -408,23 +414,26 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2
|
||||||
|
|
||||||
|
|
||||||
if (true) {
|
if (true) {
|
||||||
error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end);
|
if (PrintDNA){
|
||||||
error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end);
|
error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end);
|
||||||
error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end);
|
error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end);
|
||||||
error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end);
|
error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end);
|
||||||
error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end);
|
error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end);
|
||||||
error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end);
|
error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end);
|
||||||
error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end);
|
error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end);
|
||||||
error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end);
|
error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end);
|
||||||
|
error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end);
|
||||||
error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons);
|
}
|
||||||
error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons);
|
if (PrintAA){
|
||||||
error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons);
|
error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons);
|
||||||
error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons);
|
error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons);
|
||||||
error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons);
|
error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons);
|
||||||
error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons);
|
error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons);
|
||||||
error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons);
|
error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons);
|
||||||
error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons);
|
error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons);
|
||||||
|
error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons);
|
||||||
|
error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons);
|
||||||
|
}
|
||||||
out.printf("\n");
|
out.printf("\n");
|
||||||
out.printf("%s",error);
|
out.printf("%s",error);
|
||||||
}
|
}
|
||||||
|
|
@ -433,23 +442,27 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2
|
||||||
|
|
||||||
//Prints SNP names for each site
|
//Prints SNP names for each site
|
||||||
if (true){
|
if (true){
|
||||||
PrintSNPS(HLA_A_start,HLA_A_end);
|
if (PrintDNA){
|
||||||
PrintSNPS(HLA_C_start,HLA_C_end);
|
PrintSNPS(HLA_A_start,HLA_A_end);
|
||||||
PrintSNPS(HLA_B_start,HLA_B_end);
|
PrintSNPS(HLA_C_start,HLA_C_end);
|
||||||
PrintSNPS(HLA_DRB1_start,HLA_DRB1_end);
|
PrintSNPS(HLA_B_start,HLA_B_end);
|
||||||
PrintSNPS(HLA_DQA1_start,HLA_DQA1_end);
|
PrintSNPS(HLA_DRB1_start,HLA_DRB1_end);
|
||||||
PrintSNPS(HLA_DQB1_start,HLA_DQB1_end);
|
PrintSNPS(HLA_DQA1_start,HLA_DQA1_end);
|
||||||
PrintSNPS(HLA_DPA1_start,HLA_DPA1_end);
|
PrintSNPS(HLA_DQB1_start,HLA_DQB1_end);
|
||||||
PrintSNPS(HLA_DPB1_start,HLA_DPB1_end);
|
PrintSNPS(HLA_DPA1_start,HLA_DPA1_end);
|
||||||
|
PrintSNPS(HLA_DPB1_start,HLA_DPB1_end);
|
||||||
|
}
|
||||||
|
|
||||||
PrintAminoAcidSites(A_exons,"A",true);
|
if (PrintAA){
|
||||||
PrintAminoAcidSites(C_exons,"C",false);
|
PrintAminoAcidSites(A_exons,"A",true);
|
||||||
PrintAminoAcidSites(B_exons,"B",false);
|
PrintAminoAcidSites(C_exons,"C",false);
|
||||||
PrintAminoAcidSites(DRB1_exons,"DRB1",false);
|
PrintAminoAcidSites(B_exons,"B",false);
|
||||||
PrintAminoAcidSites(DQA1_exons,"DQA1",true);
|
PrintAminoAcidSites(DRB1_exons,"DRB1",false);
|
||||||
PrintAminoAcidSites(DQB1_exons,"DQB1",false);
|
PrintAminoAcidSites(DQA1_exons,"DQA1",true);
|
||||||
PrintAminoAcidSites(DPA1_exons,"DPA1",false);
|
PrintAminoAcidSites(DQB1_exons,"DQB1",false);
|
||||||
PrintAminoAcidSites(DPB1_exons,"DPB1",true);
|
PrintAminoAcidSites(DPA1_exons,"DPA1",false);
|
||||||
|
PrintAminoAcidSites(DPB1_exons,"DPB1",true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,9 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
|
||||||
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
|
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
|
||||||
public boolean findFirst = false;
|
public boolean findFirst = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "DEBUG", shortName = "DEBUG", doc = "Debug walker", required = false)
|
||||||
|
public boolean debug = false;
|
||||||
|
|
||||||
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
|
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
|
||||||
public String debugAllele = "";
|
public String debugAllele = "";
|
||||||
|
|
@ -50,16 +53,18 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||||
public String ethnicity = "Caucasian";
|
public String ethnicity = "Caucasian";
|
||||||
|
|
||||||
|
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
|
||||||
|
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
|
||||||
|
|
||||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||||
public boolean ONLYFREQUENT = false;
|
public boolean ONLYFREQUENT = false;
|
||||||
|
|
||||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
|
||||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||||
|
|
||||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||||
String AlleleFrequencyFile;
|
String AlleleFrequencyFile;
|
||||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||||
|
|
||||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||||
|
|
||||||
|
|
@ -74,6 +79,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
int numHLAlleles = 0;
|
int numHLAlleles = 0;
|
||||||
int minstartpos = 0;
|
int minstartpos = 0;
|
||||||
int maxstoppos = 0;
|
int maxstoppos = 0;
|
||||||
|
int numpolymorphicsites = 0, numnonpolymorphicsites = 0, pos =0;
|
||||||
|
|
||||||
int HLA_A_start = 30018310;
|
int HLA_A_start = 30018310;
|
||||||
int HLA_A_end = 30021211;
|
int HLA_A_end = 30021211;
|
||||||
|
|
@ -89,7 +95,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
//Load HLA dictionary
|
//Load HLA dictionary
|
||||||
out.printf("INFO Loading HLA dictionary ... ");
|
out.printf("INFO Loading HLA dictionary ... ");
|
||||||
|
|
||||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
HLADictionaryReader.ReadFile(HLAdictionaryFile);
|
||||||
HLAreads = HLADictionaryReader.GetReads();
|
HLAreads = HLADictionaryReader.GetReads();
|
||||||
HLAnames = HLADictionaryReader.GetReadNames();
|
HLAnames = HLADictionaryReader.GetReadNames();
|
||||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||||
|
|
@ -121,7 +127,8 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
siteFileReader.ReadFile(PolymorphicSitesFile);
|
siteFileReader.ReadFile(PolymorphicSitesFile);
|
||||||
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
||||||
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
|
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
|
||||||
|
numpolymorphicsites = PolymorphicSites.length;
|
||||||
|
numnonpolymorphicsites = NonPolymorphicSites.length;
|
||||||
|
|
||||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||||
out.printf("INFO Comparing reads to database ...\n");
|
out.printf("INFO Comparing reads to database ...\n");
|
||||||
|
|
@ -133,53 +140,15 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void FindPolymorphicSites(int start, int stop){
|
|
||||||
boolean initialized, polymorphic, examined;
|
|
||||||
char c = ' ';
|
|
||||||
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
|
|
||||||
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
|
|
||||||
//Find polymorphic sites in dictionary
|
|
||||||
for (int pos = start; pos <= stop; pos++){
|
|
||||||
initialized = false; polymorphic = false; examined = false;
|
|
||||||
//look across all alleles at specific position to see if it is polymorphic
|
|
||||||
for (int i = 0; i < HLAreads.length; i++){
|
|
||||||
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
|
||||||
if (!initialized){
|
|
||||||
c = HLAreads[i].charAt(pos-HLAstartpos[i]);
|
|
||||||
initialized = true;
|
|
||||||
examined = true;
|
|
||||||
}
|
|
||||||
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
|
|
||||||
polymorphicsites.add(pos);
|
|
||||||
out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
|
||||||
polymorphic = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
if (!polymorphic && examined){
|
|
||||||
nonpolymorphicsites.add(pos);
|
|
||||||
out.printf("CONSERVED\t6\t%s\n", pos);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
|
|
||||||
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
|
|
||||||
}
|
|
||||||
|
|
||||||
private double CalculateConcordance(SAMRecord read){
|
private double CalculateConcordance(SAMRecord read){
|
||||||
int readstart = read.getAlignmentStart();
|
int readstart = read.getAlignmentStart();
|
||||||
int readstop = read.getAlignmentEnd();
|
int readstop = read.getAlignmentEnd();
|
||||||
int numpolymorphicsites, numnonpolymorphicsites, pos;
|
|
||||||
char c1, c2;
|
char c1, c2;
|
||||||
double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0;
|
double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0;
|
||||||
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
|
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
|
||||||
String s2;
|
String s2;
|
||||||
int allelestart, allelestop;
|
int allelestart, allelestop;
|
||||||
|
|
||||||
numpolymorphicsites = PolymorphicSites.length;
|
|
||||||
numnonpolymorphicsites = NonPolymorphicSites.length;
|
|
||||||
|
|
||||||
if (ONLYFREQUENT){
|
if (ONLYFREQUENT){
|
||||||
minFreq = 0.0001;
|
minFreq = 0.0001;
|
||||||
}
|
}
|
||||||
|
|
@ -200,7 +169,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||||
c1 = s1.charAt(pos-readstart);
|
c1 = s1.charAt(pos-readstart);
|
||||||
c2 = s2.charAt(pos-allelestart);
|
c2 = s2.charAt(pos-allelestart);
|
||||||
if (c1 != 'D'){//allow for deletions (sequencing errors)
|
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
||||||
numcompared[i]++;
|
numcompared[i]++;
|
||||||
if (c1 == c2){
|
if (c1 == c2){
|
||||||
nummatched[i]++;
|
nummatched[i]++;
|
||||||
|
|
@ -214,16 +183,17 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
//Non-polymorphic sites: increment denominator only when bases are discordant
|
//Non-polymorphic sites: increment denominator only when bases are discordant
|
||||||
|
if (numcompared[i] > 0){
|
||||||
for (int j = 0; j < numnonpolymorphicsites; j++){
|
for (int j = 0; j < numnonpolymorphicsites; j++){
|
||||||
pos = NonPolymorphicSites[j];
|
pos = NonPolymorphicSites[j];
|
||||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||||
c1 = s1.charAt(pos-readstart);
|
c1 = s1.charAt(pos-readstart);
|
||||||
c2 = s2.charAt(pos-allelestart);
|
c2 = s2.charAt(pos-allelestart);
|
||||||
if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
|
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
|
||||||
numcompared[i]++;
|
numcompared[i]++;
|
||||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -259,26 +229,32 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
|
||||||
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
//Calculate concordance for this read and all overlapping reads
|
//Calculate concordance for this read and all overlapping reads
|
||||||
double maxConcordance = CalculateConcordance(read);
|
if (read.getMappingQuality() > 0){
|
||||||
|
double maxConcordance = CalculateConcordance(read);
|
||||||
|
|
||||||
String readname = read.getReadName(), allelename = ""; double freq;
|
if (maxConcordance > 0){
|
||||||
//For input bam files that contain HLA alleles, find and print allele frequency
|
String readname = read.getReadName(), allelename = ""; double freq;
|
||||||
freq = GetAlleleFrequency(readname);
|
//For input bam files that contain HLA alleles, find and print allele frequency
|
||||||
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
|
//freq = GetAlleleFrequency(readname);
|
||||||
|
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
|
||||||
|
|
||||||
//Find the maximum frequency of the alleles most concordant with the read
|
//Find the maximum frequency of the alleles most concordant with the read
|
||||||
double maxFreq = FindMaxAlleleFrequency(maxConcordance);
|
//double maxFreq = FindMaxAlleleFrequency(maxConcordance);
|
||||||
|
|
||||||
//Print concordance statistics between this read and the most similar HLA allele(s)
|
//Print concordance statistics between this read and the most similar HLA allele(s)
|
||||||
for (int i = 0; i < HLAreads.length; i++){
|
|
||||||
if (concordance[i] == maxConcordance && maxConcordance > 0){
|
for (int i = 0; i < HLAreads.length; i++){
|
||||||
freq = GetAlleleFrequency(HLAnames[i]);
|
if (concordance[i] == maxConcordance){
|
||||||
if (freq == maxFreq){
|
freq = GetAlleleFrequency(HLAnames[i]);
|
||||||
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
//if (freq == maxFreq){
|
||||||
|
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||||
|
//}
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
out.print("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out.print("\n");
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,152 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Hashtable;
|
||||||
|
/**
|
||||||
|
* Finds polymorphic sites in the HLA dictionary. Usage: java -jar GenomeAnalysisTK.jar -T FindPolymorphicSites -I HLA_DICTIONARY.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT
|
||||||
|
* @author shermanjia
|
||||||
|
*/
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
|
public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
@Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
|
||||||
|
public String debugRead = "";
|
||||||
|
|
||||||
|
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
|
||||||
|
public boolean findFirst = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
|
||||||
|
public String debugAllele = "";
|
||||||
|
|
||||||
|
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||||
|
public String ethnicity = "Caucasian";
|
||||||
|
|
||||||
|
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||||
|
public boolean ONLYFREQUENT = false;
|
||||||
|
|
||||||
|
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||||
|
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||||
|
|
||||||
|
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||||
|
|
||||||
|
//String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||||
|
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||||
|
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||||
|
String AlleleFrequencyFile;
|
||||||
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||||
|
|
||||||
|
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||||
|
|
||||||
|
boolean DatabaseLoaded = false;
|
||||||
|
boolean DEBUG = false;
|
||||||
|
|
||||||
|
String[] HLAnames, HLAreads;
|
||||||
|
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
|
||||||
|
double[] SingleAlleleFrequencies;
|
||||||
|
|
||||||
|
double[] nummatched, concordance, numcompared;
|
||||||
|
int numHLAlleles = 0;
|
||||||
|
int minstartpos = 0;
|
||||||
|
int maxstoppos = 0;
|
||||||
|
|
||||||
|
int HLA_A_start = 30018310;
|
||||||
|
int HLA_A_end = 30021211;
|
||||||
|
|
||||||
|
Hashtable AlleleFrequencies = new Hashtable();
|
||||||
|
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||||
|
CigarParser formatter = new CigarParser();
|
||||||
|
|
||||||
|
public Integer reduceInit() {
|
||||||
|
if (!DatabaseLoaded){
|
||||||
|
DatabaseLoaded = true;
|
||||||
|
|
||||||
|
//Load HLA dictionary
|
||||||
|
out.printf("INFO Loading HLA dictionary ... ");
|
||||||
|
|
||||||
|
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||||
|
HLAreads = HLADictionaryReader.GetReads();
|
||||||
|
HLAnames = HLADictionaryReader.GetReadNames();
|
||||||
|
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||||
|
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||||
|
minstartpos = HLADictionaryReader.GetMinStartPos();
|
||||||
|
maxstoppos = HLADictionaryReader.GetMaxStopPos();
|
||||||
|
|
||||||
|
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
||||||
|
|
||||||
|
nummatched = new double[HLAreads.length];
|
||||||
|
concordance = new double[HLAreads.length];
|
||||||
|
numcompared = new double[HLAreads.length];
|
||||||
|
|
||||||
|
//Read allele frequencies
|
||||||
|
if (ethnicity.equals("Black")){
|
||||||
|
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||||
|
}else{
|
||||||
|
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||||
|
}
|
||||||
|
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||||
|
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||||
|
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
|
||||||
|
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||||
|
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||||
|
|
||||||
|
FindPolymorphicSites(minstartpos,maxstoppos);
|
||||||
|
|
||||||
|
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||||
|
out.printf("INFO Comparing reads to database ...\n");
|
||||||
|
|
||||||
|
if (DEBUG){
|
||||||
|
//out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void FindPolymorphicSites(int start, int stop){
|
||||||
|
boolean initialized, polymorphic, examined;
|
||||||
|
char c = ' ';
|
||||||
|
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
|
||||||
|
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
|
||||||
|
//Find polymorphic sites in dictionary
|
||||||
|
for (int pos = start; pos <= stop; pos++){
|
||||||
|
initialized = false; polymorphic = false; examined = false;
|
||||||
|
//look across all alleles at specific position to see if it is polymorphic
|
||||||
|
for (int i = 0; i < HLAreads.length; i++){
|
||||||
|
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
||||||
|
if (!initialized){
|
||||||
|
c = HLAreads[i].charAt(pos-HLAstartpos[i]);
|
||||||
|
initialized = true;
|
||||||
|
examined = true;
|
||||||
|
}
|
||||||
|
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
|
||||||
|
polymorphicsites.add(pos);
|
||||||
|
out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
||||||
|
polymorphic = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
if (!polymorphic && examined){
|
||||||
|
nonpolymorphicsites.add(pos);
|
||||||
|
out.printf("CONSERVED\t6\t%s\n", pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
|
||||||
|
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
//Calculate concordance for this read and all overlapping reads
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return value + sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -16,8 +16,9 @@ import java.util.Hashtable;
|
||||||
*/
|
*/
|
||||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
||||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam";
|
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||||
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.closest";
|
// String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS1.closest";
|
||||||
|
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS2.closest";
|
||||||
|
|
||||||
boolean DatabaseLoaded = false;
|
boolean DatabaseLoaded = false;
|
||||||
boolean DEBUG = false;
|
boolean DEBUG = false;
|
||||||
|
|
@ -40,14 +41,24 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
||||||
int HLA_B_end = 31432914;
|
int HLA_B_end = 31432914;
|
||||||
int HLA_C_start = 31344925;
|
int HLA_C_start = 31344925;
|
||||||
int HLA_C_end = 31347827;
|
int HLA_C_end = 31347827;
|
||||||
|
int HLA_DQA1_start = 32713161;
|
||||||
|
int HLA_DQA1_end = 32719407;
|
||||||
|
int HLA_DQB1_start = 32735635;
|
||||||
|
int HLA_DQB1_end = 32742444;
|
||||||
|
int HLA_DPA1_start = 33140772;
|
||||||
|
int HLA_DPA1_end = 33149356;
|
||||||
|
int HLA_DPB1_start = 33151738;
|
||||||
|
int HLA_DPB1_end = 33162954;
|
||||||
|
int HLA_DRB1_start = 32654525;
|
||||||
|
int HLA_DRB1_end = 32665540;
|
||||||
|
|
||||||
|
|
||||||
ArrayList<String> PolymorphicSites = new ArrayList<String>();
|
ArrayList<String> PolymorphicSites = new ArrayList<String>();
|
||||||
|
|
||||||
Hashtable ClosestAllele = new Hashtable();
|
Hashtable ClosestAllele = new Hashtable();
|
||||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1, iDRBstart = -1, iDRBstop = -1, iDQAstart = -1, iDQAstop = -1, iDQBstart = -1, iDQBstop = -1, iDPAstart = -1, iDPAstop = -1, iDPBstart = -1, iDPBstop = -1;
|
||||||
CigarParser formatter = new CigarParser();
|
CigarParser formatter = new CigarParser();
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
if (!DatabaseLoaded){
|
if (!DatabaseLoaded){
|
||||||
try{
|
try{
|
||||||
|
|
@ -77,6 +88,21 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
||||||
}else if (s[0].indexOf("HLA_C") > -1){
|
}else if (s[0].indexOf("HLA_C") > -1){
|
||||||
if (iCstart < 0){iCstart=i;}
|
if (iCstart < 0){iCstart=i;}
|
||||||
iCstop = i; i++;
|
iCstop = i; i++;
|
||||||
|
}else if (s[0].indexOf("HLA_DRB1") > -1){
|
||||||
|
if (iDRBstart < 0){iDRBstart=i;}
|
||||||
|
iDRBstop = i; i++;
|
||||||
|
}else if (s[0].indexOf("HLA_DQA1") > -1){
|
||||||
|
if (iDQAstart < 0){iDQAstart=i;}
|
||||||
|
iDQAstop = i; i++;
|
||||||
|
}else if (s[0].indexOf("HLA_DQB1") > -1){
|
||||||
|
if (iDQBstart < 0){iDQBstart=i;}
|
||||||
|
iDQBstop = i; i++;
|
||||||
|
}else if (s[0].indexOf("HLA_DPA1") > -1){
|
||||||
|
if (iDPAstart < 0){iDPAstart=i;}
|
||||||
|
iDPAstop = i; i++;
|
||||||
|
}else if (s[0].indexOf("HLA_DPB1") > -1){
|
||||||
|
if (iDPBstart < 0){iDPBstart=i;}
|
||||||
|
iDPBstop = i; i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -146,10 +172,12 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
||||||
int numM = 0, numI = 0, numD = 0;
|
int numM = 0, numI = 0, numD = 0;
|
||||||
|
|
||||||
name = read.getReadName();
|
name = read.getReadName();
|
||||||
|
|
||||||
String matchedAllele = (String) ClosestAllele.get(name);
|
String matchedAllele = (String) ClosestAllele.get(name);
|
||||||
|
|
||||||
//out.printf("%s\t%s\n",name,matchedAllele);
|
//out.printf("%s\t%s\n",name,matchedAllele);
|
||||||
int index = HLAnames.indexOf(matchedAllele);
|
int index = HLAnames.indexOf(matchedAllele);
|
||||||
|
|
||||||
String matchedRead = HLAreads.get(index);
|
String matchedRead = HLAreads.get(index);
|
||||||
|
|
||||||
if (name.indexOf("HLA_A") > -1){
|
if (name.indexOf("HLA_A") > -1){
|
||||||
|
|
@ -161,8 +189,24 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
||||||
} else if (name.indexOf("HLA_C") > -1){
|
} else if (name.indexOf("HLA_C") > -1){
|
||||||
startimputation = HLA_C_start;
|
startimputation = HLA_C_start;
|
||||||
stopimputation = HLA_C_end;
|
stopimputation = HLA_C_end;
|
||||||
|
} else if (name.indexOf("HLA_DRB1") > -1){
|
||||||
|
startimputation = HLA_DRB1_start;
|
||||||
|
stopimputation = HLA_DRB1_end;
|
||||||
|
} else if (name.indexOf("HLA_DQA1") > -1){
|
||||||
|
startimputation = HLA_DQA1_start;
|
||||||
|
stopimputation = HLA_DQA1_end;
|
||||||
|
} else if (name.indexOf("HLA_DQB1") > -1){
|
||||||
|
startimputation = HLA_DQB1_start;
|
||||||
|
stopimputation = HLA_DQB1_end;
|
||||||
|
} else if (name.indexOf("HLA_DPA1") > -1){
|
||||||
|
startimputation = HLA_DPA1_start;
|
||||||
|
stopimputation = HLA_DPA1_end;
|
||||||
|
} else if (name.indexOf("HLA_DPB1") > -1){
|
||||||
|
startimputation = HLA_DPB1_start;
|
||||||
|
stopimputation = HLA_DPB1_end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//out.printf("DEBUG %s\t%s\t%s\t%s\t%s\n",name,matchedAllele,index,startimputation,stopimputation);
|
||||||
for (int i = startimputation; i <= stopimputation; i++){
|
for (int i = startimputation; i <= stopimputation; i++){
|
||||||
//if position is within read
|
//if position is within read
|
||||||
if (i >= readstart && i <= readstop){
|
if (i >= readstart && i <= readstop){
|
||||||
|
|
|
||||||
|
|
@ -7,12 +7,16 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Hashtable;
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @author shermanjia
|
* @author shermanjia
|
||||||
*/
|
*/
|
||||||
public class SimilarityFileReader {
|
public class SimilarityFileReader {
|
||||||
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
||||||
|
Hashtable Concordance = new Hashtable();
|
||||||
|
Hashtable NumMatches = new Hashtable();
|
||||||
|
Hashtable NumMismatches = new Hashtable();
|
||||||
|
|
||||||
public ArrayList<String> GetReadsToDiscard(){
|
public ArrayList<String> GetReadsToDiscard(){
|
||||||
return ReadsToDiscard;
|
return ReadsToDiscard;
|
||||||
|
|
@ -22,6 +26,18 @@ public class SimilarityFileReader {
|
||||||
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
|
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Hashtable GetConcordance(){
|
||||||
|
return Concordance;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Hashtable GetNumMatches(){
|
||||||
|
return NumMatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Hashtable GetNumMismatches(){
|
||||||
|
return NumMismatches;
|
||||||
|
}
|
||||||
|
|
||||||
public void ReadFile(String filename, int minAllowedMismatches){
|
public void ReadFile(String filename, int minAllowedMismatches){
|
||||||
try{
|
try{
|
||||||
FileInputStream fstream = new FileInputStream(filename);
|
FileInputStream fstream = new FileInputStream(filename);
|
||||||
|
|
@ -35,6 +51,10 @@ public class SimilarityFileReader {
|
||||||
if (s.length >= 6){
|
if (s.length >= 6){
|
||||||
Double matchFraction = Double.valueOf(s[4]);
|
Double matchFraction = Double.valueOf(s[4]);
|
||||||
int numMismatches = Integer.valueOf(s[6]);
|
int numMismatches = Integer.valueOf(s[6]);
|
||||||
|
|
||||||
|
Concordance.put(s[0],matchFraction);
|
||||||
|
NumMatches.put(s[0], s[5]);
|
||||||
|
NumMismatches.put(s[0], numMismatches);
|
||||||
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
|
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
|
||||||
ReadsToDiscard.add(s[0]);
|
ReadsToDiscard.add(s[0]);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue