Remove PhaselikelihoodsWalker.java

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3544 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2010-06-14 15:21:43 +00:00
parent 5704294f9d
commit d8c963c91c
1 changed file with 0 additions and 402 deletions

View File

@ -1,402 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList;
import java.util.Hashtable;
/**
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\
dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
* @author shermanjia
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
public String baseLikelihoodsFile = "";
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
public boolean DEBUG = false;
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
public String filterFile = "";
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "CaucasiansUSA";
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
public String debugAlleles = "";
@Argument(fullName = "phaseInterval", shortName = "phaseInterval", doc = "Use only these intervals in phase calculation", required = false)
public String phaseIntervalFile = "";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
public int MINALLOWEDMISMATCHES = 7;
GATKArgumentCollection args = this.getToolkit().getArguments();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
SAMFileReader HLADictionaryReader = new SAMFileReader();
boolean HLAdataLoaded = false;
String[] HLAnames, HLAreads;
ArrayList<String> ReadsToDiscard;
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
int[][] numObservations, totalObservations, intervals;
int[] SNPnumInRead, SNPposInRead;
CigarParser cigarparser = new CigarParser();
Hashtable AlleleFrequencies, UniqueAlleles;
int numIntervals;
double P_err = 0.01;
double P_correct = 1 - P_err;
double L_err = Math.log10(P_err);
double L_correct = Math.log10(P_correct);
public Integer reduceInit() {
if (!HLAdataLoaded){
HLAdataLoaded = true;
//Load HLA dictionary
out.printf("INFO Loading HLA dictionary ... ");
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
if (!filterFile.equals("")){
out.printf("INFO Reading properties file ... ");
SimilarityFileReader similarityReader = new SimilarityFileReader();
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
ReadsToDiscard = similarityReader.GetReadsToDiscard();
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
}
//Reading base likelihoods
if (!baseLikelihoodsFile.equals("")){
//Get only sites found to be different from reference
out.printf("INFO Reading base likelihoods file ... ");
BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
out.printf("%s polymorphic sites found\n",PolymorphicSites.length);
}else{
//use all sites in class 1 HLA
out.printf("INFO Using all positions in the classical HLA genes ... ");
PolymorphicSites = InitializePolymorphicSites();
}
int l = PolymorphicSites.length;
SNPnumInRead = new int[l];
SNPposInRead = new int[l];
numObservations = new int[l*5][l*5];
totalObservations = new int[l][l];
//Read allele frequencies
if (!ethnicity.equals("CaucasianUSA")){
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
}
out.printf("INFO Reading HLA allele frequencies for %s population ... ", ethnicity);
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
/*
PolymorphicSites = FindPolymorphicSites(HLADictionaryReader.GetMinStartPos(),HLADictionaryReader.GetMaxStopPos());
*/
//Determine intervals
if (!phaseIntervalFile.equals("")){
TextFileReader fileReader = new TextFileReader();
fileReader.ReadFile(phaseIntervalFile);
String[] lines = fileReader.GetLines();
intervals = new int[lines.length][2];
for (int i = 0; i < lines.length; i++) {
String[] s = lines[i].split(":");
out.printf("INFO Interval: %s\n",lines[i], s[1]);
String[] intervalPieces = s[1].split("-");
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
}
numIntervals = intervals.length;
}
}
return 0;
}
private boolean IsWithinInterval(int pos){
boolean isWithinInterval = false;
for (int i = 0; i < numIntervals; i++){
if (pos >= intervals[i][0] && pos <= intervals[i][1]){
isWithinInterval = true;
break;
}
}
return isWithinInterval;
}
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
if (!ReadsToDiscard.contains(read.getReadName())){
UpdateCorrelation(read);
}else{
}
return 1;
}
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
public void onTraversalDone(Integer numreads) {
String name1, name2;
Double frq1 = 0.0, frq2 = 0.0, likelihood, minfrq = 0.0;
int numCombinations = 0;
if (!debugAlleles.equals("")){
String s[] = debugAlleles.split(",");
int index1 = HLADictionaryReader.GetReadIndex(s[0]);
int index2 = HLADictionaryReader.GetReadIndex(s[1]);
out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
if (index1 > -1 && index2 > -1){
likelihood = CalculatePhaseLikelihood(index1,index2,true);
}
}
if (ONLYFREQUENT){
minfrq = 0.0001;
}
out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
for (int i = 0; i < HLAnames.length; i++){
name1 = HLAnames[i].substring(4);
String [] n1 = name1.split("\\*");
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
if (AlleleFrequencies.containsKey(name1)){
frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
}else{
frq1 = .0001;
}
//if (frq1 > minfrq){
for (int j = i; j < HLAnames.length; j++){
name2 = HLAnames[j].substring(4);
String [] n2 = name2.split("\\*");
if (n1[0].equals(n2[0]) && UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
if (AlleleFrequencies.containsKey(name2)){
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
}else{
frq2 = .0001;
}
// if (frq2 > minfrq){
likelihood = CalculatePhaseLikelihood(i,j,false);
numCombinations++;
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
// }
}
}
//}
}
}
}
private Integer[] InitializePolymorphicSites(){
int HLA_A_start = 30018310, HLA_A_end = 30021211, num_A_positions = HLA_A_end - HLA_A_start + 1;
int HLA_B_start = 31430239, HLA_B_end = 31432914, num_B_positions = HLA_B_end - HLA_B_start + 1;
int HLA_C_start = 31344925, HLA_C_end = 31347827, num_C_positions = HLA_C_end - HLA_C_start + 1;
Integer[] polymorphicSites = new Integer[num_A_positions+num_B_positions+num_C_positions];
for (int i = 0; i < num_A_positions; i++){
polymorphicSites[i]=HLA_A_start + i;
}
for (int i = 0; i < num_C_positions; i++){
polymorphicSites[i+num_A_positions]=HLA_C_start + i;
}
for (int i = 0; i < num_B_positions; i++){
polymorphicSites[i+num_A_positions+num_C_positions]=HLA_B_start + i;
}
return polymorphicSites;
}
private int IndexOf(char c){
switch(c){
case 'A': return 0;
case 'C': return 1;
case 'G': return 2;
case 'T': return 3;
//case 'D': return 4;
default: return -1;
}
}
private void UpdateCorrelation(SAMRecord read){
//Updates correlation table with SNPs from specific read (for phasing)
String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());
ArrayList<Integer> SNPsInRead = new ArrayList<Integer>();
ArrayList<Integer> readindex = new ArrayList<Integer>();
int readstart = read.getAlignmentStart();
int readend = read.getAlignmentEnd();
int numPositions = PolymorphicSites.length;
char c1, c2;
int a, b, i, j, SNPcount = 0;
//Find all SNPs in read
for (i = 0; i < numPositions; i++){
if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
SNPnumInRead[i] = SNPcount;
SNPposInRead[i] = PolymorphicSites[i]-readstart;
SNPcount++;
}else{
SNPnumInRead[i] = -1;
SNPposInRead[i] = -1;
}
}
//Update correlation table; for each combination of SNP positions
for (i = 0; i < numPositions; i++){
if (SNPnumInRead[i] > -1){
c1 = s.charAt(SNPposInRead[i]);
if (IndexOf(c1) > -1){
for (j = i+1; j < numPositions; j ++){
if (SNPnumInRead[j] > -1){
c2 = s.charAt(SNPposInRead[j]);
if (IndexOf(c2) > -1){
a = i*5 + IndexOf(c1);
b = j*5 + IndexOf(c2);
numObservations[a][b]++;
totalObservations[i][j]++;
if (DEBUG){
out.printf("DEBUG %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b);
}
}
}
}
}
}
}
}
private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG){
//calculate the likelihood that the particular combination of alleles satisfies the phase count data
double likelihood = 0, prob = 0;
int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
int combinedstart = Math.max(readstart1,readstart2);
int combinedstop = Math.min(readend1,readend2);
int numPositions = PolymorphicSites.length, SNPcount = 0;
int i, j, a1, a2, b1, b2;
char c11, c12, c21, c22;
int numInPhase = 0, numOutOfPhase = 0;
double sumInPhase = 0.0, sumObservations = 0.0;
//Find all SNPs in read
for (i = 0; i < numPositions; i++){
if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop && IsWithinInterval(PolymorphicSites[i])){
SNPnumInRead[i] = SNPcount;
SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
SNPcount++;
}else{
SNPnumInRead[i] = -1;
SNPposInRead[i] = -1;
}
}
String s1 = HLAreads[alleleIndex1];
String s2 = HLAreads[alleleIndex2];
if (PRINTDEBUG){
out.printf("DEBUG %s SNPs found in %s and %s\n",SNPcount,HLAnames[alleleIndex1], HLAnames[alleleIndex2]);
}
//Iterate through every pairwise combination of SNPs, and update likelihood for the allele combination
for (i = 0; i < numPositions; i++){
if (SNPnumInRead[i] > -1){
c11 = s1.charAt(SNPposInRead[i]);
c21 = s2.charAt(SNPposInRead[i]);
if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
for (j = i+1; j < numPositions; j ++){
if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
c12 = s1.charAt(SNPposInRead[j]);
c22 = s2.charAt(SNPposInRead[j]);
if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
a1 = i*5 + IndexOf(c11);
b1 = j*5 + IndexOf(c12);
a2 = i*5 + IndexOf(c21);
b2 = j*5 + IndexOf(c22);
//check if the two alleles are identical at the chosen 2 locations
if ((c11 == c21) && (c12 == c22)){
numInPhase = numObservations[a1][b1];
}else{
numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
}
numOutOfPhase = totalObservations[i][j] - numInPhase;
sumInPhase += (double) numInPhase;
sumObservations += (double) totalObservations[i][j];
likelihood += numInPhase * L_correct + numOutOfPhase * L_err;
//prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001);
//likelihood += Math.log10(prob);
//likelihood = Math.max(Math.log10(sumInPhase / sumObservations),-10);
if (PRINTDEBUG){
out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%s / %s\t %.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],sumInPhase,sumObservations,likelihood);
}
break;
}
}
}
}
}
}
return likelihood;
}
}