HLA Caller 2.0 Walkers:

CalculateBaseLikelihoodsWalker.java walks through reads and calculates likelihoods using SSG at each base position
CalculateAlleleLikelihoodsWalker.java walks through HLA dictionary and calculates likelihoods for allele pairs given output of CalculateBaseLikelihoodsWalker.java
CalculatePhaseLikelihoodsWalker.java walks through reads and calculates likelihood scores for allele pairs given phase information

File Readers:
BaseLikelihoodsFileReader.java reads text file of likelihoods outputted by SSG
FrequencyFileReader.java reads text file of HLA allele frequencies
PolymorphicSitesFileReader.java reads text file of polymorphic sites in the HLA dictionary
SAMFileReader.java reads a sam file (used to read HLA dictionary when in another walker)
SimilarityFileReader.java reads a text file of how similar each read is to the closest HLA allele (used to filter misaligned reads)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1744 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2009-09-29 20:45:55 +00:00
parent 281a77c981
commit 9b78a789e2
13 changed files with 1347 additions and 239 deletions

View File

@ -0,0 +1,102 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
/**
 * Parses the per-base genotype-likelihoods text file emitted by SSG
 * (CalculateBaseLikelihoodsWalker output).
 *
 * Expected line format (tab-separated): column 0 is "contig:position",
 * column 1 is the reference base, columns 3..12 are the ten diploid
 * genotype likelihoods. Lines containing "INFO" are treated as log/header
 * output and skipped.
 *
 * @author shermanjia
 */
public class BaseLikelihoodsFileReader {
    // Row per data line; 10 genotype likelihoods per row (same ordering as the SSG output columns).
    double[][] baseLikelihoods;
    // Genomic coordinate parsed from the "contig:position" field, one per data line.
    int[] positions;
    // Positions where some genotype is more likely than hom-ref; filled only when requested in ReadFile.
    ArrayList<Integer> polymorphicSites = new ArrayList<Integer>();

    /** @return polymorphic positions found during ReadFile (empty unless findPolymorphicSites was true). */
    public Integer[] GetPolymorphicSites(){
        return polymorphicSites.toArray(new Integer[polymorphicSites.size()]);
    }

    /** @return the parsed likelihood matrix, or null if ReadFile has not succeeded yet. */
    public double[][] GetBaseLikelihoods(){
        return baseLikelihoods;
    }

    /** @return the genomic position of each row in GetBaseLikelihoods(). */
    public int[] GetPositions(){
        return positions;
    }

    /**
     * Reads the likelihoods file in a single pass and populates positions /
     * baseLikelihoods (and, optionally, polymorphicSites).
     *
     * Fix vs. previous version: the file is no longer read twice (once to
     * count lines, once to parse), and the reader is always closed, even on
     * the error path. On a parse error the arrays are left unassigned and a
     * message is printed, matching the original best-effort error handling.
     *
     * @param filename             path to the SSG base-likelihoods text file
     * @param findPolymorphicSites when true, also record positions where a
     *                             non-hom-ref genotype has higher likelihood
     */
    public void ReadFile(String filename, boolean findPolymorphicSites){
        BufferedReader br = null;
        try{
            br = new BufferedReader(new FileReader(filename));
            ArrayList<double[]> likelihoodRows = new ArrayList<double[]>();
            ArrayList<Integer> positionList = new ArrayList<Integer>();
            String strLine;
            while ((strLine = br.readLine()) != null) {
                // Skip logging/header lines interleaved with the data.
                if (strLine.indexOf("INFO") != -1){continue;}
                String[] s = strLine.split("\\t");
                int position = Integer.valueOf(s[0].split(":")[1]);
                char ref = s[1].charAt(0);
                double[] row = new double[10];
                // Columns 3..12 carry the ten genotype likelihoods.
                for (int i = 3; i <= 12; i++){
                    row[i - 3] = Double.valueOf(s[i]);
                }
                positionList.add(position);
                likelihoodRows.add(row);
                if (findPolymorphicSites && IsPolymorphicSite(row, ref)){
                    polymorphicSites.add(position);
                }
            }
            int n = likelihoodRows.size();
            baseLikelihoods = new double[n][10];
            positions = new int[n];
            for (int i = 0; i < n; i++){
                baseLikelihoods[i] = likelihoodRows.get(i);
                positions[i] = positionList.get(i);
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("BaseLikelihoodsFileReader Error: " + e.getMessage());
        }finally{
            if (br != null){
                try { br.close(); } catch (IOException ignored) { /* best effort close */ }
            }
        }
    }

    /**
     * Maps a reference base to the index of its homozygous genotype in the
     * 10-genotype column ordering (AA=0, CC=4, GG=7, TT=9); -1 for non-ACGT.
     */
    private int IndexOf(char c){
        switch(c){
            case 'A': return 0;
            case 'C': return 4;
            case 'G': return 7;
            case 'T': return 9;
            default: return -1;
        }
    }

    /**
     * A site is polymorphic when any genotype likelihood strictly exceeds the
     * homozygous-reference likelihood.
     * Note: if ref is not A/C/G/T, IndexOf returns -1 and this throws
     * ArrayIndexOutOfBoundsException, which the caller's catch absorbs.
     */
    private boolean IsPolymorphicSite(double[] likelihoods, char ref){
        boolean isPolymorphicSite = false;
        double homreflikelihood = likelihoods[IndexOf(ref)];
        for (int i = 0; i < 10; i++){
            if (likelihoods[i] > homreflikelihood){
                isPolymorphicSite = true;
            }
        }
        return isPolymorphicSite;
    }
}

View File

@ -0,0 +1,209 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Hashtable;
/**
 * Second stage of the HLA caller 2.0 pipeline: traverses the HLA dictionary
 * (each allele stored as a SAM "read") and, after traversal, scores every
 * pair of HLA alleles by summing the per-position diploid genotype
 * likelihoods produced by CalculateBaseLikelihoodsWalker (SSG output).
 * Pairs are filtered by population allele frequency and printed to out.
 *
 * @author shermanjia
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
    public String baseLikelihoodsFile = "";
    @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
    public boolean DEBUG = false;
    @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
    public boolean FREQUENT = false;
    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "Caucasian";
    // Hard-coded frequency tables; selected by the -ethnicity argument in reduceInit.
    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
    String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
    String AlleleFrequencyFile;
    // Allele name -> frequency, as loaded by FrequencyFileReader; values are
    // re-parsed via toString() below, so the stored value type is opaque here.
    Hashtable AlleleFrequencies;
    // Expands each dictionary entry's CIGAR so sequence chars align with reference positions.
    CigarParser formatter = new CigarParser();
    // Parsed SSG output: one row of 10 genotype likelihoods per position in 'positions'.
    double[][] baseLikelihoods;
    int[] positions;
    // One-shot guard so the input files are loaded only once (reduceInit may run more than
    // once per traversal in this framework — presumably why the guard exists; TODO confirm).
    boolean loaded = false;
    // Parallel arrays describing each HLA dictionary allele, materialized in onTraversalDone.
    String[] HLAnames, HLAreads;
    Integer[] HLAstartpos, HLAstoppos;
    // Accumulators filled during map().
    ArrayList<String> HLAnamesAL, HLAreadsAL;
    ArrayList<Integer> HLAstartposAL, HLAstopposAL;

    /**
     * Loads the base-likelihoods file and the ethnicity-appropriate allele
     * frequency table once, and initializes the per-allele accumulators.
     * @return 0 (initial reduce value; reduce sums read counts)
     */
    public Integer reduceInit() {
        if (!loaded){
            loaded = true;
            BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
            baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, false);
            baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods();
            positions = baseLikelihoodsReader.GetPositions();
            HLAnamesAL = new ArrayList<String>();
            HLAreadsAL = new ArrayList<String>();
            HLAstartposAL = new ArrayList<Integer>();
            HLAstopposAL = new ArrayList<Integer>();
            // Any ethnicity other than "Black" falls back to the Caucasian table.
            if (ethnicity.equals("Black")){
                AlleleFrequencyFile = BlackAlleleFrequencyFile;
            }else{
                AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
            }
            out.printf("INFO  Reading HLA allele frequencies ... ");
            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
            out.printf("INFO  Reading HLA dictionary ...");
        }
        return 0;
    }

    /**
     * Records one HLA dictionary allele: its name, CIGAR-expanded sequence,
     * and alignment span. The actual pair scoring happens in onTraversalDone.
     * @return 1 so reduce counts the alleles seen
     */
    public Integer map(char[] ref, SAMRecord read) {
        HLAnamesAL.add(read.getReadName());
        HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
        HLAstartposAL.add(read.getAlignmentStart());
        HLAstopposAL.add(read.getAlignmentEnd());
        return 1;
    }

    /**
     * Maps an unordered base pair to its index in the 10-genotype ordering
     * (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT); symmetric in its arguments.
     * Returns -1 for any non-ACGT base.
     *
     * NOTE(review): the outer switch deliberately relies on fall-through —
     * there is no break after each inner switch, so e.g. a='A' with an
     * invalid b falls through the remaining cases until "default: return -1".
     * The returned values are correct, but the control flow is fragile.
     */
    private int GenotypeIndex(char a, char b){
        switch(a){
            case 'A':
                switch(b){
                    case 'A': return 0;
                    case 'C': return 1;
                    case 'G': return 2;
                    case 'T': return 3;
                };
            case 'C':
                switch(b){
                    case 'A': return 1;
                    case 'C': return 4;
                    case 'G': return 5;
                    case 'T': return 6;
                };
            case 'G':
                switch(b){
                    case 'A': return 2;
                    case 'C': return 5;
                    case 'G': return 7;
                    case 'T': return 8;
                };
            case 'T':
                switch(b){
                    case 'A': return 3;
                    case 'C': return 6;
                    case 'G': return 8;
                    case 'T': return 9;
                };
            default: return -1;
        }
    }

    /** Sums per-read map() results; the total is the number of dictionary alleles. */
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * After all alleles are collected: for every pair (i, j) with i <= j whose
     * frequencies pass the (optional) minimum-frequency filter and whose
     * alignment spans overlap, compute and print the summed genotype
     * likelihood for the pair.
     */
    public void onTraversalDone(Integer numreads) {
        out.printf("Done! %s alleles found\n", numreads);
        HLAnames = HLAnamesAL.toArray(new String[numreads]);
        HLAreads = HLAreadsAL.toArray(new String[numreads]);
        HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
        HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
        double[][] AlleleLikelihoods = new double[numreads][numreads];
        String name1, name2;
        double frq1, frq2;
        double minfrq = 0;
        if (FREQUENT){
            minfrq = 0.0001;
        }
        int numcombinations = 0;
        out.printf("NUM\tAllele1\tAllele2\tSSG\n");
        for (int i = 0; i < numreads; i++){
            // Strip the "HLA_" prefix to match the keys in the frequency table.
            name1 = HLAnames[i].substring(4);
            if (AlleleFrequencies.containsKey(name1)){
                frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
                if (frq1 > minfrq){
                    // j starts at i: include homozygous pairs, avoid duplicates.
                    for (int j = i; j < numreads; j++){
                        name2 = HLAnames[j].substring(4);
                        if (AlleleFrequencies.containsKey(name2)){
                            frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
                            if (frq2 > minfrq){
                                // Only score pairs whose alignment intervals overlap.
                                if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
                                    numcombinations++;
                                    AlleleLikelihoods[i][j] = CalculateLikelihood(i,j);
                                    out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]);
                                }
                            }else{
                                if (DEBUG){out.printf("%s has allele frequency%.5f\n",name2,frq2);}
                            }
                        }else{
                            if (DEBUG){out.printf("%s not found in allele frequency file\n",name2);}
                        }
                    }
                }else{
                    if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
                }
            }else{
                if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
            }
        }
    }

    /**
     * Sums, over every SSG position strictly inside BOTH alleles' alignment
     * spans (endpoints excluded by the strict inequalities), the likelihood
     * of the genotype formed by the two alleles' bases at that position.
     * Positions where either base is not A/C/G/T (index -1) are skipped.
     *
     * @param a1 index of the first allele in the parallel arrays
     * @param a2 index of the second allele
     * @return log-scale likelihood sum for the allele pair
     */
    private double CalculateLikelihood(int a1, int a2){
        String read1 = HLAreads[a1];
        String read2 = HLAreads[a2];
        int start1 = HLAstartpos[a1];
        int start2 = HLAstartpos[a2];
        int stop1 = HLAstoppos[a1];
        int stop2 = HLAstoppos[a2];
        double likelihood = 0;
        int pos, index;
        char c1, c2;
        for (int i = 0; i < positions.length; i++){
            pos = positions[i];
            if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){
                index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2));
                if (index > -1){
                    likelihood = likelihood + baseLikelihoods[i][index];
                }else{
                    /*if (DEBUG){
                        c1 = read1.charAt(pos-start1);
                        c2 = read2.charAt(pos-start2);
                        out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood);
                    }*/
                }
            }
        }
        return likelihood;
    }
}

View File

@ -0,0 +1,163 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
/**
 * First stage of the HLA caller 2.0 pipeline: a LocusWalker that computes,
 * at every covered position, the ten diploid genotype likelihoods from the
 * aligned sequencing reads (via ThreeStateErrorGenotypeLikelihoods) and
 * prints them. Its text output is the "base likelihoods file" consumed by
 * CalculateAlleleLikelihoodsWalker and CalculatePhaseLikelihoodsWalker.
 *
 * @author shermanjia
 */
public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Long, Long>>{
    @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
    public boolean DEBUG = false;
    @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
    public String inputAlleles = "";
    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "Caucasian";
    @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
    public String filterFile = "";
    // Hard-coded pipeline inputs: HLA dictionary SAM plus per-ethnicity frequency tables.
    String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
    String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
    String AlleleFrequencyFile;
    // NOTE: this is the HLAcaller package's own SAMFileReader (see file header), not samtools'.
    SAMFileReader HLADictionaryReader = new SAMFileReader();
    // Parallel arrays describing the HLA dictionary alleles.
    String[] HLAreads, HLAnames;
    Integer[] HLAstartpos, HLAstoppos;
    // Allele name -> frequency, loaded by FrequencyFileReader.
    Hashtable AlleleFrequencies;
    // NOTE(review): allocated in InitializeVariables but never read or written
    // elsewhere in this class — looks like leftover scaffolding; confirm before removing.
    int[][] LOD, LikelihoodScores;
    // Read names flagged as misaligned by the -filter file; such reads are excluded in map().
    ArrayList<String> ReadsToDiscard = new ArrayList<String>();
    ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
    ArrayList<String> AllReadNames = new ArrayList<String>();
    // One-shot guard so inputs load only once even if reduceInit runs repeatedly.
    boolean HLAdataLoaded = false;
    //Loads HLA dictionary, allele frequencies, and reads to filter
    public Pair<Long, Long> reduceInit() {
        if (!HLAdataLoaded){
            HLAdataLoaded = true;
            out.printf("INFO  Reading HLA database ... ");
            HLADictionaryReader.ReadFile(HLAdatabaseFile);
            HLAreads = HLADictionaryReader.GetReads();
            HLAnames = HLADictionaryReader.GetReadNames();
            HLAstartpos = HLADictionaryReader.GetStartPositions();
            HLAstoppos = HLADictionaryReader.GetStopPositions();
            InitializeVariables(HLAreads.length);
            out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
            // Any ethnicity other than "Black" falls back to the Caucasian table.
            if (ethnicity.equals("Black")){
                AlleleFrequencyFile = BlackAlleleFrequencyFile;
            }else{
                AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
            }
            out.printf("INFO  Reading HLA allele frequencies ... ");
            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
            // Optional misaligned-read filter (SimilarityFileReader output).
            if (!filterFile.equals("")){
                out.printf("INFO  Reading properties file ... ");
                SimilarityFileReader similarityReader = new SimilarityFileReader();
                similarityReader.ReadFile(filterFile);
                ReadsToDiscard = similarityReader.GetReadsToDiscard();
                out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
            }
        }
        return new Pair<Long,Long>(0l,0l);
    }

    /** Zero-fills the (currently unused) LOD / LikelihoodScores matrices for n alleles. */
    private void InitializeVariables(int n){
        LOD = new int[n][n];
        LikelihoodScores = new int[n][n];
        for (int i = 0; i < n; i++){
            for (int j = 0; j <n; j++){
                LOD[i][j]=0;
                LikelihoodScores[i][j]=0;
            }
        }
    }

    /**
     * At each locus: feed every usable pileup base (read not in the discard
     * list, base is A/C/G/T) into a fresh ThreeStateErrorGenotypeLikelihoods
     * model, then print the locus, base counts, and the ten genotype
     * likelihoods as one tab-separated line.
     *
     * @return number of reads in the pileup at this locus
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        List<SAMRecord> reads = context.getReads();
        if(reads.size() > 0 ) {
            List<Integer> offsets = context.getOffsets();
            int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
            //if (DEBUG){
                out.printf("%s\t%s\t", context.getLocation(),ref.getBase());
            //}
            //Calculate posterior probabilities
            GenotypeLikelihoods G = new ThreeStateErrorGenotypeLikelihoods();
            SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
            //Check for bad bases and ensure mapping quality
            for (int i = 0; i < reads.size(); i++) {
                read = reads.get(i);
                offset = offsets.get(i);
                base = read.getReadString().charAt(offset);
                qual = read.getBaseQualities()[offset];
                //mapquality = read.getMappingQuality();
                if (!ReadsToDiscard.contains(read.getReadName()) && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
                    //consider base in likelihood calculations if it looks good and has high mapping score
                    G.add(base, qual, read, offset);
                    // NOTE(review): counters only advance when DEBUG is set, yet the
                    // A[..]C[..]T[..]G[..] line below prints unconditionally, so the
                    // counts read as 0 in non-debug runs — confirm whether intended.
                    if (DEBUG){
                        if (base == 'A'){numAs++;}
                        else if (base == 'C'){numCs++;}
                        else if (base == 'T'){numTs++;}
                        else if (base == 'G'){numGs++;}
                    }
                }
            }
            //if (DEBUG) {
                out.printf("A[%s]C[%s]T[%s]G[%s]",numAs,numCs,numTs,numGs);
                for ( DiploidGenotype g : DiploidGenotype.values() ) {
                    out.printf("\t%.2f",G.getLikelihood(g));
                }
                out.printf("\n");
            //}
        }
        return context.getReads().size();
    }

    /** Accumulates (total reads seen, loci visited). */
    public Pair<Long, Long> reduce(Integer value, Pair<Long, Long> sum) {
        long left = value.longValue() + sum.getFirst();
        long right = sum.getSecond() + 1l;
        return new Pair<Long,Long>(left, right);
    }

    /** No summary output: all results are printed per-locus in map(). */
    public void onTraversalDone(Pair<Long, Long> result) {
    }
}

View File

@ -0,0 +1,314 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.ArrayList;
import java.util.Hashtable;
/**
 * Third stage of the HLA caller 2.0 pipeline: builds a pairwise-SNP
 * co-occurrence ("phase") table from the sequencing reads, then scores every
 * frequency-eligible pair of HLA dictionary alleles by how well the pair
 * explains the observed phase counts at polymorphic sites.
 *
 * @author shermanjia
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
    public String baseLikelihoodsFile = "";
    @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
    public boolean DEBUG = false;
    @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
    public String filterFile = "";
    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "Caucasian";
    @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
    public String debugAlleles = "";
    @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
    public boolean ONLYFREQUENT = false;
    // Hard-coded pipeline inputs: per-ethnicity frequency tables and the HLA dictionary SAM.
    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
    String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
    String AlleleFrequencyFile;
    String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
    // The HLAcaller package's own SAMFileReader (see file header), not samtools'.
    SAMFileReader HLADictionaryReader = new SAMFileReader();
    boolean HLAdataLoaded = false;
    String[] HLAnames, HLAreads;
    // BUG FIX: initialize to an empty list. Previously this was left null unless
    // a -filter file was supplied, so map() threw a NullPointerException on the
    // default path. (The sibling CalculateBaseLikelihoodsWalker already
    // initializes its copy at declaration.)
    ArrayList<String> ReadsToDiscard = new ArrayList<String>();
    Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
    // numObservations[i*5+base1][j*5+base2] counts reads carrying base1 at
    // polymorphic site i together with base2 at site j; totalObservations[i][j]
    // is the total informative read count for the site pair.
    int[][] numObservations, totalObservations;
    // Scratch arrays, recomputed per read / per allele pair: ordinal of each
    // polymorphic site within the current span, and its offset from span start
    // (-1 when the site is outside the span).
    int[] SNPnumInRead, SNPposInRead;
    CigarParser cigarparser = new CigarParser();
    // Allele name -> frequency, loaded by FrequencyFileReader.
    Hashtable AlleleFrequencies;

    /**
     * One-time setup: loads the HLA dictionary, the optional misaligned-read
     * filter, the polymorphic-site list (from the base-likelihoods file, or
     * every position of the classical class-I genes as a fallback), sizes the
     * phase-count tables, and loads the allele frequency table.
     * @return 0 (initial reduce value)
     */
    public Integer reduceInit() {
        if (!HLAdataLoaded){
            HLAdataLoaded = true;
            //Load HLA dictionary
            out.printf("INFO  Loading HLA dictionary ... ");
            HLADictionaryReader.ReadFile(HLAdatabaseFile);
            HLAreads = HLADictionaryReader.GetReads();
            HLAnames = HLADictionaryReader.GetReadNames();
            HLAstartpos = HLADictionaryReader.GetStartPositions();
            HLAstoppos = HLADictionaryReader.GetStopPositions();
            out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
            if (!filterFile.equals("")){
                out.printf("INFO  Reading properties file ... ");
                SimilarityFileReader similarityReader = new SimilarityFileReader();
                similarityReader.ReadFile(filterFile);
                ReadsToDiscard = similarityReader.GetReadsToDiscard();
                out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
            }
            //Reading base likelihoods
            if (!baseLikelihoodsFile.equals("")){
                //Get only sites found to be different from reference
                out.printf("INFO  Reading base likelihoods file ... ");
                BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
                baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
                PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
                out.printf("%s polymorphic sites found\n",PolymorphicSites.length);
            }else{
                //use all sites in class 1 HLA
                out.printf("INFO  Using all positions in the classical HLA genes ... ");
                PolymorphicSites = InitializePolymorphicSites();
            }
            int l = PolymorphicSites.length;
            SNPnumInRead = new int[l];
            SNPposInRead = new int[l];
            // 5 slots per site (A,C,G,T + one spare; see IndexOf).
            numObservations = new int[l*5][l*5];
            totalObservations = new int[l][l];
            //Read allele frequencies; any ethnicity other than "Black" uses the Caucasian table.
            if (ethnicity.equals("Black")){
                AlleleFrequencyFile = BlackAlleleFrequencyFile;
            }else{
                AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
            }
            out.printf("INFO  Reading HLA allele frequencies ... ");
            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
        }
        return 0;
    }

    /**
     * Folds each non-discarded read's SNP pairs into the phase-count tables.
     * @return 1 so reduce counts the reads traversed
     */
    public Integer map(char[] ref, SAMRecord read) {
        if (!ReadsToDiscard.contains(read.getReadName())){
            UpdateCorrelation(read);
        }
        return 1;
    }

    /** Sums per-read map() results. */
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * After traversal: optionally prints a debug trace for the two alleles
     * named in -debugAlleles, then scores every pair of alleles (same locus
     * letter, both present in the frequency table, both above the optional
     * minimum frequency) and prints one line per pair.
     */
    public void onTraversalDone(Integer numreads) {
        String name1, name2;
        Double frq1, frq2, likelihood, minfrq = 0.0;
        int numCombinations = 0;
        if (!debugAlleles.equals("")){
            String s[] = debugAlleles.split(",");
            int index1 = HLADictionaryReader.GetReadIndex(s[0]);
            int index2 = HLADictionaryReader.GetReadIndex(s[1]);
            if (index1 > -1 && index2 > -1){
                // Result is discarded; the call is made for its PRINTDEBUG output.
                likelihood = CalculatePhaseLikelihood(index1,index2,true);
            }
        }
        if (ONLYFREQUENT){
            minfrq = 0.0001;
        }
        out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
        for (int i = 0; i < HLAnames.length; i++){
            // Strip the "HLA_" prefix to match the frequency-table keys.
            name1 = HLAnames[i].substring(4);
            if (AlleleFrequencies.containsKey(name1)){
                frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
                if (frq1 > minfrq){
                    // j starts at i: include homozygous pairs, avoid duplicates.
                    for (int j = i; j < HLAnames.length; j++){
                        name2 = HLAnames[j].substring(4);
                        // Only pair alleles of the same gene (first character, e.g. "A" vs "B").
                        if (name1.substring(0,1).equals(name2.substring(0,1))){
                            if (AlleleFrequencies.containsKey(name2)){
                                frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
                                if (frq2 > minfrq){
                                    likelihood = CalculatePhaseLikelihood(i,j,false);
                                    numCombinations++;
                                    out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Fallback site list when no base-likelihoods file is given: every
     * position spanned by the HLA-A, HLA-C, and HLA-B genes (hard-coded
     * chromosome-6 coordinates), in A, C, B order to match genomic layout.
     */
    private Integer[] InitializePolymorphicSites(){
        int HLA_A_start = 30018310, HLA_A_end = 30021211, num_A_positions = HLA_A_end - HLA_A_start + 1;
        int HLA_B_start = 31430239, HLA_B_end = 31432914, num_B_positions = HLA_B_end - HLA_B_start + 1;
        int HLA_C_start = 31344925, HLA_C_end = 31347827, num_C_positions = HLA_C_end - HLA_C_start + 1;
        Integer[] polymorphicSites = new Integer[num_A_positions+num_B_positions+num_C_positions];
        for (int i = 0; i < num_A_positions; i++){
            polymorphicSites[i]=HLA_A_start + i;
        }
        for (int i = 0; i < num_C_positions; i++){
            polymorphicSites[i+num_A_positions]=HLA_C_start + i;
        }
        for (int i = 0; i < num_B_positions; i++){
            polymorphicSites[i+num_A_positions+num_C_positions]=HLA_B_start + i;
        }
        return polymorphicSites;
    }

    /**
     * Base -> compact index used in the phase tables (A=0,C=1,G=2,T=3);
     * -1 for anything else. The tables reserve 5 slots per site (a 'D'
     * deletion slot was once planned — see the commented case).
     */
    private int IndexOf(char c){
        switch(c){
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            //case 'D': return 4;
            default: return -1;
        }
    }

    /**
     * Updates the phase-count tables with one read: for every ordered pair of
     * polymorphic sites strictly inside the read (endpoints excluded), if both
     * bases are A/C/G/T, increments numObservations for that base pair and
     * totalObservations for the site pair.
     */
    private void UpdateCorrelation(SAMRecord read){
        //Updates correlation table with SNPs from specific read (for phasing)
        String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());
        int readstart = read.getAlignmentStart();
        int readend = read.getAlignmentEnd();
        int numPositions = PolymorphicSites.length;
        char c1, c2;
        int a, b, i, j, SNPcount = 0;
        //Find all SNPs in read (strictly inside the alignment span)
        for (i = 0; i < numPositions; i++){
            if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
                SNPnumInRead[i] = SNPcount;
                SNPposInRead[i] = PolymorphicSites[i]-readstart;
                SNPcount++;
            }else{
                SNPnumInRead[i] = -1;
                SNPposInRead[i] = -1;
            }
        }
        //Update correlation table; for each combination of SNP positions
        for (i = 0; i < numPositions; i++){
            if (SNPnumInRead[i] > -1){
                c1 = s.charAt(SNPposInRead[i]);
                if (IndexOf(c1) > -1){
                    for (j = i+1; j < numPositions; j ++){
                        if (SNPnumInRead[j] > -1){
                            c2 = s.charAt(SNPposInRead[j]);
                            if (IndexOf(c2) > -1){
                                a = i*5 + IndexOf(c1);
                                b = j*5 + IndexOf(c2);
                                numObservations[a][b]++;
                                totalObservations[i][j]++;
                                if (DEBUG){
                                    out.printf("DEBUG %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Scores how well an allele pair explains the observed phase counts.
     * For each pair of polymorphic sites inside BOTH alleles' spans (and with
     * at least one observation), the in-phase count is the observations
     * matching allele 1's base pair, plus allele 2's when the alleles differ
     * at either site; the per-pair probability is floored at 0.0001 and
     * accumulated in log10.
     *
     * NOTE: reuses the shared SNPnumInRead / SNPposInRead scratch arrays, so
     * this must not run concurrently with UpdateCorrelation.
     *
     * @param alleleIndex1 index of the first allele in the dictionary arrays
     * @param alleleIndex2 index of the second allele
     * @param PRINTDEBUG   when true, prints a per-site-pair trace
     * @return log10 phase likelihood for the pair
     */
    private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG){
        //calculate the likelihood that the particular combination of alleles satisfies the phase count data
        double likelihood = 0, prob = 0;
        int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
        int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
        int combinedstart = Math.max(readstart1,readstart2);
        int combinedstop = Math.min(readend1,readend2);
        int numPositions = PolymorphicSites.length, SNPcount = 0;
        int i, j, a1, a2, b1, b2;
        char c11, c12, c21, c22;
        int numInPhase = 0;
        //Find all SNPs within the overlap of both alleles
        for (i = 0; i < numPositions; i++){
            if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop){
                SNPnumInRead[i] = SNPcount;
                SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
                SNPcount++;
            }else{
                SNPnumInRead[i] = -1;
                SNPposInRead[i] = -1;
            }
        }
        String s1 = HLAreads[alleleIndex1];
        String s2 = HLAreads[alleleIndex2];
        //Iterate through every pairwise combination of SNPs, and update likelihood for the allele combination
        for (i = 0; i < numPositions; i++){
            if (SNPnumInRead[i] > -1){
                c11 = s1.charAt(SNPposInRead[i]);
                c21 = s2.charAt(SNPposInRead[i]);
                if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
                    for (j = i+1; j < numPositions; j ++){
                        if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
                            c12 = s1.charAt(SNPposInRead[j]);
                            c22 = s2.charAt(SNPposInRead[j]);
                            if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
                                a1 = i*5 + IndexOf(c11);
                                b1 = j*5 + IndexOf(c12);
                                a2 = i*5 + IndexOf(c21);
                                b2 = j*5 + IndexOf(c22);
                                //check if the two alleles are identical at the chosen 2 locations
                                if ((c11 == c21) && (c12 == c22)){
                                    numInPhase = numObservations[a1][b1];
                                }else{
                                    numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
                                }
                                prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001);
                                likelihood += Math.log10(prob);
                                if (PRINTDEBUG){
                                    out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%.4f\t%.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],prob,likelihood);
                                }
                            }
                        }
                    }
                }
            }
        }
        return likelihood;
    }
}

View File

@ -6,6 +6,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -35,21 +36,39 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
@Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false)
public boolean suppressPrinting = false;
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
public boolean DEBUG = false;
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
public String inputAlleles = "";
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "Caucasian";
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
public String filterFile = "";
String al1 = "", al2 = "", al3 = "", al4 = "";
//String HLAdatabaseFile = "/Users/shermanjia/Work/HLA.sam";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.4digitUnique.sam";
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.4digitUnique.sam";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.4digitUnique.sam";
//String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/CEU_Founders_HLA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
ArrayList<String> HLAreads = new ArrayList<String>();
ArrayList<String> HLAcigars = new ArrayList<String>();
ArrayList<String> HLAnames = new ArrayList<String>();
ArrayList<String> HLApositions = new ArrayList<String>();
ArrayList<String> ReadsToFilter = new ArrayList<String>();
ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
ArrayList<String> AllReadNames = new ArrayList<String>();
@ -82,13 +101,27 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
boolean DatabaseLoaded = false;
boolean PrintedOutput = false;
boolean DEBUG = false;
public Pair<Long, Long> reduceInit() {
//Load sequences corresponding to HLA alleles from sam file
if (!DatabaseLoaded){
try{
//Load sequences corresponding to HLA alleles from sam file
if (!inputAlleles.equals("")){
String[] str = inputAlleles.split(",");
al1 = str[0];
al2 = str[1];
al3 = str[2];
al4 = str[3];
}
//set ethnic group to look up allele frequencies
if (ethnicity.equals("Black")){
AlleleFrequencyFile = BlackAlleleFrequencyFile;
}else{
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
}
out.printf("Reading HLA database ...");
FileInputStream fstream = new FileInputStream(HLAdatabaseFile);
DataInputStream in = new DataInputStream(fstream);
@ -140,23 +173,23 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
CombinedAlleleFrequencies[i][j]=0.0;
}
//For debugging: get index for specific alleles
if (HLAnames.get(i).equals("HLA_A*3101"))
if (HLAnames.get(i).equals("HLA_" + al1))
j1 = i;
if (HLAnames.get(i).equals("HLA_A*3201"))
if (HLAnames.get(i).equals("HLA_" + al2))
k1 = i;
if (HLAnames.get(i).equals("HLA_A*3121"))
if (HLAnames.get(i).equals("HLA_" + al3))
j2 = i;
if (HLAnames.get(i).equals("HLA_A*3207"))
if (HLAnames.get(i).equals("HLA_" + al4))
k2 = i;
}
out.printf("DONE! Read %s alleles\n",HLAreads.size());
}catch (Exception e){//Catch exception if any
System.err.println("Error: " + e.getMessage());
System.err.println("CallHLAWalker Error: " + e.getMessage());
}
try{
out.printf("Reading allele frequences ...");
FileInputStream fstream = new FileInputStream(CaucasianAlleleFrequencyFile);
FileInputStream fstream = new FileInputStream(AlleleFrequencyFile);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
@ -170,9 +203,31 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
in.close();
out.printf("Done! Read %s alleles\n",count);
}catch (Exception e){//Catch exception if any
System.err.println("Error: " + e.getMessage());
System.err.println("CallHLAWalker Error: " + e.getMessage());
}
if (!filterFile.equals("")){
try{
out.printf("Reading reads to filter ...");
FileInputStream fstream = new FileInputStream(filterFile);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
//Read File Line By Line
int count = 0;
while ((strLine = br.readLine()) != null){
s = strLine.split("\\t");
if (Integer.valueOf(s[6]) > 10){
ReadsToFilter.add(s[0]);
}
count++;
}
in.close();
out.printf("Done! %s reads to exclude\n",count);
}catch (Exception e){//Catch exception if any
System.err.println("CallHLAWalker Error: " + e.getMessage());
}
}
DatabaseLoaded = true;
out.printf("Comparing reads to database ...\n");
@ -283,21 +338,27 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
//Check for bad bases and ensure mapping quality myself. This works.
for (int i = 0; i < context.getReads().size(); i++) {
read = context.getReads().get(i);
offset = context.getOffsets().get(i);
for (int i = 0; i < reads.size(); i++) {
read = reads.get(i);
offset = offsets.get(i);
base = read.getReadString().charAt(offset);
qual = read.getBaseQualities()[offset];
mapquality = read.getMappingQuality();
if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
//consider base in likelihood calculations if it looks good and has high mapping score
G.add(base, qual, read, offset);
readname = read.getReadName();
if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(read);}
if (base == 'A'){numAs++; depth++;}
else if (base == 'C'){numCs++; depth++;}
else if (base == 'T'){numTs++; depth++;}
else if (base == 'G'){numGs++; depth++;}
if (ReadsToFilter.contains(read.getReadName())){
if (DEBUG){
out.printf("\n%s %s %s %s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),base);
}
}else{
//consider base in likelihood calculations if it looks good and has high mapping score
G.add(base, qual, read, offset);
readname = read.getReadName();
if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(read);}
if (base == 'A'){numAs++; depth++;}
else if (base == 'C'){numCs++; depth++;}
else if (base == 'T'){numTs++; depth++;}
else if (base == 'G'){numGs++; depth++;}
}
}
}
@ -402,9 +463,9 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
}
if ( DEBUG ){
//Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes
out.printf("LOD[%s%s]:%5.1fL:%5.1fLOD[%s%s]:%5.1fL:%5.1f\t",r1,r2,LOD[j1][k1],LikelihoodScores[j1][k1],s1,s2,LOD[j2][k2],LikelihoodScores[j2][k2]);
out.printf("Likelihoods %s%s[%5.1f] %s%s[%5.1f]\t",r1,r2,LikelihoodScores[j1][k1],s1,s2,LikelihoodScores[j2][k2]);
for ( DiploidGenotype g : DiploidGenotype.values() ) {
out.printf("%s %5.1f\t",g.toString(),Scores.get(g.toString()));
out.printf("%s[%5.1f] ",g.toString(),Scores.get(g.toString()));
}
out.printf("\n");
}
@ -596,10 +657,6 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
return prob;
}
public boolean isReduceByInterval() {
return true;
}
public Pair<Long, Long> reduce(Integer value, Pair<Long, Long> sum) {
long left = value.longValue() + sum.getFirst();
long right = sum.getSecond() + 1l;
@ -653,7 +710,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
for (Integer i = 0; i < numHLAlleles; i++){
for (Integer j = i; j < numHLAlleles; j++){
if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1 && maxA > 0){
if (maxA - LOD[i][j] <= 5 && maxA >= LOD[i][j]){
if (maxA - LOD[i][j] <= 10 && maxA >= LOD[i][j]){
inverseMaxProbA = inverseMaxProbA + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA);
if (!TopAlleles.contains(i)){TopAlleles.add(i);}
if (!TopAlleles.contains(j)){TopAlleles.add(j);}
@ -662,7 +719,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
}
}
} else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1 && maxB > 0){
if (maxB - LOD[i][j] <= 5 && maxB - LOD[i][j] >= 0){
if (maxB - LOD[i][j] <= 10 && maxB - LOD[i][j] >= 0){
inverseMaxProbB = inverseMaxProbB + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB);
if (!TopAlleles.contains(i)){TopAlleles.add(i);}
if (!TopAlleles.contains(j)){TopAlleles.add(j);}
@ -671,7 +728,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
}
}
} else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1 && maxC > 0){
if (maxC - LOD[i][j] <= 5 && maxC - LOD[i][j] >= 0){
if (maxC - LOD[i][j] <= 10 && maxC - LOD[i][j] >= 0){
inverseMaxProbC = inverseMaxProbC + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC);
if (!TopAlleles.contains(i)){TopAlleles.add(i);}
if (!TopAlleles.contains(j)){TopAlleles.add(j);}
@ -754,11 +811,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
for (Integer i = 0; i < numHLAlleles; i++){
for (Integer j = i; j < numHLAlleles; j++){
if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){
if ((LOD[i][j] >= maxA - 5) && LOD[i][j] > 0){
if ((LOD[i][j] >= maxA - 10) && LOD[i][j] > 0){
PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)];
if (PhasingScores[i][j] > maxAphase){maxAphase = PhasingScores[i][j];}
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;}
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;}
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;}
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;}
SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2;
if (CombinedAlleleFrequencies[i][j] > maxAfreq){maxAfreq = CombinedAlleleFrequencies[i][j];}
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA)/inverseMaxProbA;
@ -767,11 +824,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbA){maxProbA = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];}
}
} else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){
if ((LOD[i][j] >= maxB - 5) && LOD[i][j] > 0){
if ((LOD[i][j] >= maxB - 10) && LOD[i][j] > 0){
PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)];
if (PhasingScores[i][j] > maxBphase){maxBphase = PhasingScores[i][j];}
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;}
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;}
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;}
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;}
SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2;
if (freq1*freq2 > maxBfreq){maxBfreq = freq1*freq2;}
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB)/inverseMaxProbB;
@ -780,12 +837,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbB){maxProbB = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];}
}
} else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){
if ((LOD[i][j] >= maxC - 5)&& LOD[i][j] > 0){
if ((LOD[i][j] >= maxC - 10)&& LOD[i][j] > 0){
PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)];
if (PhasingScores[i][j] > maxCphase){maxCphase = PhasingScores[i][j];}
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;}
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;}
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;}
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;}
SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2;
if (freq1*freq2 > maxCfreq){maxCfreq = freq1*freq2;}
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC)/inverseMaxProbC;
@ -808,7 +864,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
for (Integer i = 0; i < numHLAlleles; i++){
for (Integer j = i; j < numHLAlleles; j++){
if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){
if((LOD[i][j] >= maxA - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxAphase
if((LOD[i][j] >= maxA - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxAphase
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA)/inverseMaxProbA;
//out.printf("%s\t%s\tloglikelihood=%5.0f\tmax=%5.0f\tinvP=%.2f\tPrior=%.3f\tPhase=%.3f\tfreq1=%s\tfreq2=%s\tf1*f2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],maxlikelihoodA,inverseMaxProbA,likelihoodPrior,PhasingScores[i][j],SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]/ProbSumA);
out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumA,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumA);
@ -816,14 +872,14 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
out.printf("\n");
}
} else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){
if((LOD[i][j] >= maxB - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxBphase
if((LOD[i][j] >= maxB - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxBphase
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB)/inverseMaxProbB;
out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumB,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumB);
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbB){out.printf("\tBEST");}
out.printf("\n");
}
} else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){
if((LOD[i][j] >= maxC - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxCphase
if((LOD[i][j] >= maxC - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxCphase
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC)/inverseMaxProbC;
out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumC,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumC);
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbC){out.printf("\tBEST");}

View File

@ -12,10 +12,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
*
* @author shermanjia
*/
public class ReadCigarFormatter {
public class CigarParser {
String formattedRead;
public String GetFormattedRead(){
return formattedRead;
}
public String FormatRead(String cigar, String read){
// returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur
String formattedRead = ""; char c; String count;
formattedRead = "";
char c; String count;
int cigarPlaceholder = 0; int subcigarLength = 0;
int readPlaceholder = 0; int subreadLength = 0;
@ -43,15 +52,25 @@ public class ReadCigarFormatter {
//increment placeholders without adding inserted bases to sequence (effectively removes insertion).
cigarPlaceholder = i+1;
readPlaceholder = readPlaceholder + subreadLength;
} else if (c == 'H' || c == 'S'){
//(H = Headers or S = Soft clipped removed here)***
} else if (c == 'H'){
//(H = hard clip)
//If reaches H for insertion, get number before 'H' and skip that many characters in sequence
//If reaches H for hard clip, simply carry on
count = cigar.substring(cigarPlaceholder, i);
subreadLength = Integer.parseInt(count);
//increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion).
cigarPlaceholder = i+1;
} else if (c == 'S'){
//(S = Soft clipped bases discarded here)***
//If reaches S for soft clip, get number before 'S' and skip that many characters in sequence
count = cigar.substring(cigarPlaceholder, i);
subreadLength = Integer.parseInt(count);
//increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion).
cigarPlaceholder = i+1;
readPlaceholder = readPlaceholder + subreadLength;
} else if (c == 'D'){
//If reaches D for deletion, insert 'D' into sequence as placeholder
count = cigar.substring(cigarPlaceholder, i);

View File

@ -4,25 +4,15 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;
import org.broadinstitute.sting.gatk.walkers.*;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.lang.Math;
/**
*
* @author shermanjia
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CreateHaplotypesWalker extends ReadWalker<Integer, Integer> {
ReadCigarFormatter formatter = new ReadCigarFormatter();
CigarParser formatter = new CigarParser();
char c;
boolean DEBUG = false;
int HLA_A_start = 30018310;

View File

@ -5,142 +5,103 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.lang.Math;
/**
*
* @author shermanjia
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
@Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
public String debugRead = "";
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
public boolean findFirst = false;
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
public String debugAllele = "";
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "Caucasian";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
SAMFileReader HLADictionaryReader = new SAMFileReader();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
boolean DatabaseLoaded = false;
boolean DEBUG = false;
ArrayList<String> HLAreads = new ArrayList<String>();
ArrayList<String> HLAcigars = new ArrayList<String>();
ArrayList<String> HLAnames = new ArrayList<String>();
ArrayList<String> HLApositions = new ArrayList<String>();
String[] HLAnames, HLAreads;
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
double[] SingleAlleleFrequencies;
double[] nummatched, concordance, numcompared;
int numHLAlleles = 0;
int[] HLAstartpos;
int[] HLAstoppos;
int minstartpos = 0;
int maxstoppos = 0;
int HLA_A_start = 30018310;
int HLA_A_end = 30021211;
ArrayList<String> PolymorphicSites = new ArrayList<String>();
Hashtable AlleleFrequencies = new Hashtable();
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
ReadCigarFormatter formatter = new ReadCigarFormatter();
CigarParser formatter = new CigarParser();
public Integer reduceInit() {
if (!DatabaseLoaded){
try{
out.printf("Reading HLA database ...\n");
FileInputStream fstream = new FileInputStream(HLAdatabaseFile);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
//Read File Line By Line
int i = 0;
while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t");
if (s.length>=10){
//Parse the reads with cigar parser
HLAreads.add(formatter.FormatRead(s[5],s[9]));
HLAcigars.add(s[5]);
HLAnames.add(s[0]);
HLApositions.add(s[3]);
if (s[0].indexOf("HLA_A") > -1){
if (iAstart < 0){iAstart=i;}
iAstop = i; i++;
}else if (s[0].indexOf("HLA_B") > -1){
if (iBstart < 0){iBstart=i;}
iBstop = i; i++;
}else if (s[0].indexOf("HLA_C") > -1){
if (iCstart < 0){iCstart=i;}
iCstop = i; i++;
}
}
}
in.close();
int n = HLApositions.size(); numHLAlleles = n;
HLAstartpos = new int[n]; HLAstoppos = new int[n];
SingleAlleleFrequencies = new double[n];
for (i = 0; i < n; i++){
//Find start and stop positions for each allele
HLAstartpos[i]=Integer.parseInt(HLApositions.get(i));
HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1;
if (minstartpos == 0){minstartpos = HLAstartpos[i];}
minstartpos = Math.min(minstartpos, HLAstartpos[i]);
maxstoppos = Math.max(maxstoppos, HLAstoppos[i]);
SingleAlleleFrequencies[i]=0.0;
//Initialize matrix of probabilities / likelihoods
}
out.printf("DONE! Read %s alleles\n",HLAreads.size());
}catch (Exception e){//Catch exception if any
System.err.println("Error: " + e.getMessage());
}
try{
out.printf("Reading allele frequences ...");
FileInputStream fstream = new FileInputStream(CaucasianAlleleFrequencyFile);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
//Read File Line By Line
int count = 0;
while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t");
AlleleFrequencies.put(s[0].substring(0, 6), s[1]);
count++;
}
in.close();
out.printf("Done! Read %s alleles\n",count);
}catch (Exception e){//Catch exception if any
System.err.println("Error: " + e.getMessage());
}
char c;
if (!DatabaseLoaded){
DatabaseLoaded = true;
//Find polymorphic sites in dictionary
for (int pos = minstartpos; pos <= maxstoppos; pos++){
c = '0';
for (int i = 0; i < HLAreads.size(); i++){
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
if (c == '0'){c = HLAreads.get(i).charAt(pos-HLAstartpos[i]);}
if (HLAreads.get(i).charAt(pos-HLAstartpos[i]) != c){
PolymorphicSites.add(String.valueOf(pos));
break;
}
}
}
}
out.printf("%s polymorphic sites found in HLA dictionary\n",PolymorphicSites.size());
out.printf("Comparing reads to database ...\n");
//Load HLA dictionary
out.printf("INFO Loading HLA dictionary ... ");
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
minstartpos = HLADictionaryReader.GetMinStartPos();
maxstoppos = HLADictionaryReader.GetMaxStopPos();
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
nummatched = new double[HLAreads.length];
concordance = new double[HLAreads.length];
numcompared = new double[HLAreads.length];
//Read allele frequencies
if (ethnicity.equals("Black")){
AlleleFrequencyFile = BlackAlleleFrequencyFile;
}else{
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
//FindPolymorphicSites(minstartpos,maxstoppos);
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
siteFileReader.ReadFile(PolymorphicSitesFile);
PolymorphicSites = siteFileReader.GetPolymorphicSites();
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
out.printf("INFO Comparing reads to database ...\n");
if (DEBUG){
//out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
@ -149,97 +110,148 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
return 0;
}
private void FindPolymorphicSites(int start, int stop){
boolean initialized, polymorphic, examined;
char c = ' ';
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
//Find polymorphic sites in dictionary
for (int pos = start; pos <= stop; pos++){
initialized = false; polymorphic = false; examined = false;
//look across all alleles at specific position to see if it is polymorphic
for (int i = 0; i < HLAreads.length; i++){
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
if (!initialized){
c = HLAreads[i].charAt(pos-HLAstartpos[i]);
initialized = true;
examined = true;
}
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
polymorphicsites.add(pos);
out.printf("POLYMORPHIC\t6\t%s\n", pos);
polymorphic = true;
break;
}
}
public Integer map(char[] ref, SAMRecord read) {
}
if (!polymorphic && examined){
nonpolymorphicsites.add(pos);
out.printf("CONSERVED\t6\t%s\n", pos);
}
}
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
}
private double CalculateConcordance(SAMRecord read){
int readstart = read.getAlignmentStart();
int readstop = read.getAlignmentEnd();
double[] nummatched = new double[HLAreads.size()];
double[] concordance = new double[HLAreads.size()];
double[] numcompared = new double[HLAreads.size()];
double maxConcordance = 0;
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
int numpolymorphicsites, numnonpolymorphicsites, pos;
char c1, c2;
String s2 = "", name = "";
double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0;
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
String s2;
int allelestart, allelestop;
//For every allele that overlaps with current read
for (int i = 0; i < HLAreads.size(); i++){
nummatched[i] = 0;
numpolymorphicsites = PolymorphicSites.length;
numnonpolymorphicsites = NonPolymorphicSites.length;
if (ONLYFREQUENT){
minFreq = 0.0001;
}
for (int i = 0; i < HLAreads.length; i++){
nummatched[i] = 0; concordance[i] = 0; numcompared[i] = 0;
freq = GetAlleleFrequency(HLAnames[i]);
//Get concordance between read and specific allele
if (DEBUG){
//out.printf("%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames.get(i),readstart,readstop,HLAstartpos[i],HLAstoppos[i]);
}
if (readstart <= HLAstoppos[i] && readstop >= HLAstartpos[i]){
s2 = HLAreads.get(i);
for (int j = read.getAlignmentStart(); j <= read.getAlignmentEnd(); j++){
c1 = s1.charAt(j-readstart);
c2 = s2.charAt(j-HLAstartpos[i]);
if (DEBUG){
//out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames.get(i),j,j-readstart,j-HLAstartpos[i],c1,c2);
}
if (c1 != 'D'){
if (PolymorphicSites.contains(String.valueOf(j))){
if (readstart <= HLAstoppos[i] && readstop >= HLAstartpos[i] && freq > minFreq){
s2 = HLAreads[i];
allelestart = HLAstartpos[i];
allelestop = HLAstoppos[i];
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
for (int j = 0; j < numpolymorphicsites; j++){
pos = PolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != 'D'){
numcompared[i]++;
if (c1 == c2){
nummatched[i]++;
}else{
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
}
}
}else if (c1 != c2){
}
}
}
//Non-polymorphic sites: increment denominator only when bases are discordant
for (int j = 0; j < numnonpolymorphicsites; j++){
pos = NonPolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != c2){
numcompared[i]++;
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
}
}
}
}
}
//Update concordance array
concordance[i]=nummatched[i]/numcompared[i];
if (concordance[i] > maxConcordance){maxConcordance = concordance[i];}
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
}
if (findFirst && (concordance[i] == 1)){
break;
}
}
double freq, freq2, maxFreq = 0.0;
for (int i = 0; i < HLAreads.size(); i++){
return maxConcordance;
}
private double FindMaxAlleleFrequency(double maxConcordance){
//finds the max frequency of the alleles that share the maximum concordance with the read of interest
double freq, maxFreq = 0.0;
for (int i = 0; i < HLAreads.length; i++){
if (concordance[i] == maxConcordance && maxConcordance > 0){
if (HLAnames.get(i).length() >= 10){
name=HLAnames.get(i).substring(4,10);
}else{
name=HLAnames.get(i).substring(4);
}
if (AlleleFrequencies.containsKey(name)){
freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString());
if (DEBUG){
out.printf("%s\t%s\t%s\t%s\t%s\t%.4f\n",read.getReadName(),name,nummatched[i],numcompared[i],concordance[i],freq);
}
}else{
freq=0.0001;
}
freq = GetAlleleFrequency(HLAnames[i]);
if (freq > maxFreq){maxFreq = freq;}
}
}
return maxFreq;
}
if (read.getReadName().length() >= 10){
name=read.getReadName().substring(4,10);
}else{
name=read.getReadName().substring(4);
}
public Integer map(char[] ref, SAMRecord read) {
//Calculate concordance for this read and all overlapping reads
double maxConcordance = CalculateConcordance(read);
String readname = read.getReadName(), allelename = ""; double freq;
//For input bam files that contain HLA alleles, find and print allele frequency
freq = GetAlleleFrequency(readname);
out.printf("%s\t%.4f", readname,freq);
//Find the maximum frequency of the alleles most concordant with the read
double maxFreq = FindMaxAlleleFrequency(maxConcordance);
if (AlleleFrequencies.containsKey(name)){
freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString());
}else{
freq=0.0001;
}
out.printf("%s\t%.4f", read.getReadName(),freq);
for (int i = 0; i < HLAreads.size(); i++){
//Print concordance statistics between this read and the most similar HLA allele(s)
for (int i = 0; i < HLAreads.length; i++){
if (concordance[i] == maxConcordance && maxConcordance > 0){
if (HLAnames.get(i).length() >= 10){
name=HLAnames.get(i).substring(4,10);
}else{
name=HLAnames.get(i).substring(4);
}
if (AlleleFrequencies.containsKey(name)){
freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString());
}else{
freq=0.0001;
}
freq = GetAlleleFrequency(HLAnames[i]);
if (freq == maxFreq){
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames.get(i),freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
}
}
}
@ -247,6 +259,22 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
return 1;
}
private double GetAlleleFrequency(String allelename){
    //Normalize dictionary names to the 4-digit "A*0101" form used as
    //frequency-table keys by dropping the first 4 characters (presumably
    //the "HLA_" prefix — NOTE(review): confirm against dictionary naming)
    String key;
    if (allelename.length() >= 10){
        key = allelename.substring(4, 10);
    }else{
        key = allelename.substring(4);
    }
    //Hashtable cannot hold null values, so a null lookup means the key is absent;
    //unseen alleles get a small non-zero default frequency
    Object storedFrequency = AlleleFrequencies.get(key);
    if (storedFrequency == null){
        return 0.0001;
    }
    return Double.parseDouble(storedFrequency.toString());
}
public Integer reduce(Integer value, Integer sum) {
return value + sum;

View File

@ -0,0 +1,39 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.Hashtable;
/**
*
* @author shermanjia
*/
public class FrequencyFileReader {
    // Maps allele name (e.g. "A*0101") -> frequency string exactly as read from file.
    // Kept as a raw Hashtable to preserve the existing caller-facing interface.
    Hashtable AlleleFrequencies = new Hashtable();

    /**
     * Returns the table of allele frequencies populated by ReadFile.
     * The returned Hashtable is the live internal table, not a copy.
     */
    public Hashtable GetAlleleFrequencies(){
        return AlleleFrequencies;
    }

    /**
     * Reads a tab-delimited allele frequency file where column 0 is the
     * allele name and column 1 is its frequency. Each line is stored in
     * AlleleFrequencies keyed by the allele name. Any error is reported to
     * stderr; entries parsed before the error remain in the table.
     *
     * @param filename path to the frequency file
     */
    public void ReadFile(String filename){
        BufferedReader br = null;
        try{
            // DataInputStream wrapper from the original was redundant; read text directly.
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
            String strLine;
            //Read file line by line; expects "name<TAB>frequency" per line
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                AlleleFrequencies.put(s[0], s[1]);
                //System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString());
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("FrequencyFileReader Error: " + e.getMessage());
        }finally{
            //Bug fix: the stream was previously left open when an exception occurred mid-read
            if (br != null){
                try { br.close(); } catch (Exception ignored) { /* best-effort close */ }
            }
        }
    }
}

View File

@ -5,8 +5,6 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.*;
import java.io.FileInputStream;
@ -15,8 +13,6 @@ import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.lang.Math;
/**
*
* @author shermanjia
@ -53,7 +49,7 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
Hashtable ClosestAllele = new Hashtable();
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
ReadCigarFormatter formatter = new ReadCigarFormatter();
CigarParser formatter = new CigarParser();
public Integer reduceInit() {
if (!DatabaseLoaded){
@ -106,7 +102,7 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
}
out.printf("DONE! Read %s alleles\n",HLAreads.size());
}catch (Exception e){//Catch exception if any
System.err.println("Error: " + e.getMessage());
System.err.println("ImputeAllelsWalker Error: " + e.getMessage());
}
try{
@ -126,7 +122,7 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
in.close();
out.printf("Done! Read %s alleles\n",count);
}catch (Exception e){//Catch exception if any
System.err.println("Error: " + e.getMessage());
System.err.println("ImputeAllelsWalker Error: " + e.getMessage());
}
char c;

View File

@ -0,0 +1,49 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
/**
*
* @author shermanjia
*/
public class PolymorphicSitesFileReader {
    // Positions (column 2 of the file) flagged "POLYMORPHIC" across the HLA dictionary
    ArrayList<Integer> PolymorphicSites = new ArrayList<Integer>();
    // Positions examined but conserved (any tag other than "POLYMORPHIC")
    ArrayList<Integer> NonPolymorphicSites = new ArrayList<Integer>();

    /** Returns the polymorphic positions, in file order. */
    public Integer[] GetPolymorphicSites(){
        return PolymorphicSites.toArray(new Integer[PolymorphicSites.size()]);
    }

    /** Returns the non-polymorphic (conserved) positions, in file order. */
    public Integer[] GetNonPolymorphicSites(){
        return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]);
    }

    /**
     * Reads a tab-delimited sites file: column 0 is "POLYMORPHIC" for
     * polymorphic sites (anything else is treated as conserved) and
     * column 2 is the genomic position. Errors are reported to stderr;
     * entries parsed before the error remain in the lists.
     *
     * @param filename path to the sites file
     */
    public void ReadFile(String filename){
        BufferedReader br = null;
        try{
            // Redundant DataInputStream wrapper and unused loop counter removed.
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
            String strLine;
            //Read file line by line
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                if (s[0].equals("POLYMORPHIC")){
                    PolymorphicSites.add(Integer.valueOf(s[2]));
                }else{
                    NonPolymorphicSites.add(Integer.valueOf(s[2]));
                }
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("PolymorphicSitesFileReader Error: " + e.getMessage());
        }finally{
            //Bug fix: the stream was previously left open when an exception occurred mid-read
            if (br != null){
                try { br.close(); } catch (Exception ignored) { /* best-effort close */ }
            }
        }
    }
}

View File

@ -0,0 +1,94 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
/**
*
* @author shermanjia
*/
public class SAMFileReader {
    // Parallel lists: index i in each list describes the i-th alignment record.
    ArrayList<String> ReadStrings = new ArrayList<String>();          // bases, expanded to reference coords via cigar
    ArrayList<String> CigarStrings = new ArrayList<String>();         // raw CIGAR column
    ArrayList<String> ReadNames = new ArrayList<String>();            // QNAME column
    ArrayList<Integer> ReadStartPositions = new ArrayList<Integer>(); // 1-based POS column
    ArrayList<Integer> ReadStopPositions = new ArrayList<Integer>();  // inclusive stop = start + length - 1
    // Overall reference span covered by all records seen so far.
    int minstartpos;
    int maxstoppos;
    CigarParser formatter = new CigarParser();

    /** Returns the cigar-expanded read strings, in file order. */
    public String[] GetReads(){
        return ReadStrings.toArray(new String[ReadStrings.size()]);
    }

    /** Returns the read names (QNAME), in file order. */
    public String[] GetReadNames(){
        return ReadNames.toArray(new String[ReadNames.size()]);
    }

    /** Returns the raw cigar strings, in file order. */
    public String[] GetCigarStrings(){
        return CigarStrings.toArray(new String[CigarStrings.size()]);
    }

    /** Returns the 1-based alignment start positions, in file order. */
    public Integer[] GetStartPositions(){
        return ReadStartPositions.toArray(new Integer[ReadStartPositions.size()]);
    }

    /** Returns the inclusive alignment stop positions, in file order. */
    public Integer[] GetStopPositions(){
        return ReadStopPositions.toArray(new Integer[ReadStopPositions.size()]);
    }

    /** Smallest start position seen (0 if no records have been read). */
    public Integer GetMinStartPos(){
        return minstartpos;
    }

    /** Largest stop position seen (0 if no records have been read). */
    public Integer GetMaxStopPos(){
        return maxstoppos;
    }

    /**
     * Returns the index of the first record with the given name, or -1 if absent.
     */
    public int GetReadIndex(String readname){
        // indexOf already returns -1 for a missing element; the extra
        // contains() scan in the original was redundant.
        return ReadNames.indexOf(readname);
    }

    /**
     * Parses a SAM-formatted text file, storing each alignment record's name,
     * cigar, start/stop positions and cigar-expanded read string. Lines with
     * fewer than 10 tab-separated fields (headers, truncated lines) are
     * skipped. May be called more than once: records accumulate, and the
     * min/max span is maintained across calls (the original reset its
     * first-record flag per call and clobbered the span).
     *
     * @param filename path to the SAM file
     */
    public void ReadFile(String filename){
        BufferedReader br = null;
        try{
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
            String strLine;
            //Read File Line By Line
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                if (s.length < 10){
                    continue; // header or malformed line
                }
                //Parse the read with cigar parser
                String read = formatter.FormatRead(s[5], s[9]);
                int start = Integer.parseInt(s[3]); // parse POS once, not four times
                int stop = start + read.length() - 1;
                // Seed the span from the very first record ever seen, then widen.
                if (ReadStrings.isEmpty()){
                    minstartpos = start;
                    maxstoppos = stop;
                }else{
                    minstartpos = Math.min(minstartpos, start);
                    maxstoppos = Math.max(maxstoppos, stop);
                }
                ReadStrings.add(read);
                CigarStrings.add(s[5]);
                ReadNames.add(s[0]);
                ReadStartPositions.add(Integer.valueOf(start));
                ReadStopPositions.add(Integer.valueOf(stop));
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("SAMFileReader Error: " + e.getMessage());
        }finally{
            // Ensure the stream is closed even when a record fails to parse.
            if (br != null){
                try { br.close(); } catch (IOException ignored) { /* already reported */ }
            }
        }
    }
}

View File

@ -0,0 +1,49 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
/**
*
* @author shermanjia
*/
public class SimilarityFileReader {
    // Names of reads judged too dissimilar to their best HLA allele match.
    ArrayList<String> ReadsToDiscard = new ArrayList<String>();

    /** Returns the live list of read names to discard (not a copy). */
    public ArrayList<String> GetReadsToDiscard(){
        return ReadsToDiscard;
    }

    /** Returns the read names to discard as an array snapshot. */
    public String[] GetReadsToDiscardArray(){
        return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
    }

    /**
     * Reads a tab-delimited similarity file and collects reads to discard as
     * likely misalignments. Expected columns: [0] = read name,
     * [4] = fraction of bases matching the closest allele,
     * [6] = number of mismatches. A read is discarded when
     * (matchFraction &lt; 0.9 and mismatches &gt; 3) or mismatches &gt;= 6.
     *
     * @param filename path to the similarity file
     */
    public void ReadFile(String filename){
        BufferedReader br = null;
        try{
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
            String strLine;
            //Read File Line By Line
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                // BUG FIX: the code reads s[6], so 7 columns are required.
                // The original guard (>= 6) let a 6-column line throw
                // ArrayIndexOutOfBoundsException, which the broad catch
                // swallowed, silently dropping the rest of the file.
                if (s.length >= 7){
                    double matchFraction = Double.parseDouble(s[4]);
                    int numMismatches = Integer.parseInt(s[6]);
                    if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches >= 6)){
                        ReadsToDiscard.add(s[0]);
                    }
                }
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("SimilarityFile Error: " + e.getMessage());
        }finally{
            // Close unconditionally; the original leaked the stream on error.
            if (br != null){
                try { br.close(); } catch (IOException ignored) { /* already reported */ }
            }
        }
    }
}