Modified CreatePedFileWalker to output PED file given HLA allele names

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1763 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2009-10-05 03:06:42 +00:00
parent 56bc4fa21a
commit 98076db6b4
6 changed files with 242 additions and 58 deletions

View File

@ -8,10 +8,6 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Hashtable;

View File

@ -85,6 +85,9 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
similarityReader.ReadFile(filterFile);
ReadsToDiscard = similarityReader.GetReadsToDiscard();
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
for (int i = 0; i < ReadsToDiscard.size(); i++){
out.printf("MISALIGNED %s\n", ReadsToDiscard.get(i).toString());
}
}
}
return new Pair<Long,Long>(0l,0l);
@ -130,12 +133,12 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
//consider base in likelihood calculations if it looks good and has high mapping score
G.add(base, qual, read, offset);
if (DEBUG){
//if (DEBUG){
if (base == 'A'){numAs++;}
else if (base == 'C'){numCs++;}
else if (base == 'T'){numTs++;}
else if (base == 'G'){numGs++;}
}
//}
}
}

View File

@ -6,6 +6,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.GATKArgumentCollection;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.ArrayList;
@ -32,9 +33,14 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
public String debugAlleles = "";
@Argument(fullName = "phaseInterval", shortName = "phaseInterval", doc = "Use only these intervals in phase calculation", required = false)
public String phaseIntervalFile = "";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
GATKArgumentCollection args = this.getToolkit().getArguments();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
@ -45,11 +51,13 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
String[] HLAnames, HLAreads;
ArrayList<String> ReadsToDiscard;
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
int[][] numObservations, totalObservations;
int[][] numObservations, totalObservations, intervals;
int[] SNPnumInRead, SNPposInRead;
CigarParser cigarparser = new CigarParser();
Hashtable AlleleFrequencies;
int numIntervals;
public Integer reduceInit() {
@ -109,10 +117,37 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
PolymorphicSites = FindPolymorphicSites(HLADictionaryReader.GetMinStartPos(),HLADictionaryReader.GetMaxStopPos());
*/
//Determine intervals
if (!phaseIntervalFile.equals("")){
TextFileReader fileReader = new TextFileReader();
fileReader.ReadFile(phaseIntervalFile);
String[] lines = fileReader.GetLines();
intervals = new int[lines.length][2];
for (int i = 0; i < lines.length; i++) {
String[] s = lines[i].split(":");
String[] intervalPieces = s[0].split("-");
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
}
numIntervals = intervals.length;
for (int i = 0; i < numIntervals; i++){
out.printf("INFO Interval %s: %s-%s\n",i+1,intervals[i][0],intervals[i][1]);
}
}
}
return 0;
}
private boolean IsWithinInterval(int pos){
boolean isWithinInterval = false;
for (int i = 0; i < numIntervals; i++){
if (pos >= intervals[i][0] && pos <= intervals[i][1]){
isWithinInterval = true;
break;
}
}
return isWithinInterval;
}
public Integer map(char[] ref, SAMRecord read) {
if (!ReadsToDiscard.contains(read.getReadName())){
@ -264,7 +299,7 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
//Find all SNPs in read
for (i = 0; i < numPositions; i++){
if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop){
if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop && IsWithinInterval(PolymorphicSites[i])){
SNPnumInRead[i] = SNPcount;
SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
SNPcount++;

View File

@ -4,29 +4,41 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.lang.Math;
/**
*
* @author shermanjia
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CreatePedFileWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "allelesFile", shortName = "allelesFile", doc = "Create ped file for HLA alleles named in this file", required = true)
public String alleleNamesFile = "";
@Argument(fullName = "pedIntervals", shortName = "pedIntervals", doc = "Create genotypes in these intervals", required = false)
public String pedIntervalsFile = "";
String[] HLAnames, HLAreads, inputFileContents;
Integer[] HLAstartpos, HLAstoppos;
ArrayList<String> HLAnamesAL, HLAreadsAL;
ArrayList<Integer> HLAstartposAL, HLAstopposAL;
int[][] intervals;
int numIntervals;
ReadCigarFormatter formatter = new ReadCigarFormatter();
char c;
boolean DEBUG = false;
boolean FilesLoaded = false;
int HLA_A_start = 30018310;
int HLA_A_end = 30021211;
int HLA_B_start = 31430239;
int HLA_B_end = 31432914;
int HLA_C_start = 31344925;
int HLA_C_end = 31347827;
String[] SNPnames;
String SNPname;
int start, end;
@ -35,60 +47,155 @@ public class CreatePedFileWalker extends ReadWalker<Integer, Integer> {
Hashtable indexer = new Hashtable();
public Integer reduceInit() {
SNPnames = new String[HLA_A_end-HLA_A_start+1];
start = HLA_A_start;
end = HLA_A_end;
if (!FilesLoaded){
FilesLoaded = true;
HLAnamesAL = new ArrayList<String>();
HLAreadsAL = new ArrayList<String>();
HLAstartposAL = new ArrayList<Integer>();
HLAstopposAL = new ArrayList<Integer>();
indexer.put('A', (Integer) 1);
indexer.put('C', (Integer) 2);
indexer.put('G', (Integer) 3);
indexer.put('T', (Integer) 4);
indexer.put('D', (Integer) 0); // D for deletion
out.print("Reads:\n");
TextFileReader fileReader = new TextFileReader();
fileReader.ReadFile(alleleNamesFile);
inputFileContents = fileReader.GetLines();
//Determine intervals
if (!pedIntervalsFile.equals("")){
fileReader = new TextFileReader();
fileReader.ReadFile(pedIntervalsFile);
String[] lines = fileReader.GetLines();
intervals = new int[lines.length][2];
for (int i = 0; i < lines.length; i++) {
String[] s = lines[i].split(":");
String[] intervalPieces = s[0].split("-");
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
}
numIntervals = intervals.length;
for (int i = 0; i < numIntervals; i++){
out.printf("INFO Interval %s: %s-%s\n",i+1,intervals[i][0],intervals[i][1]);
}
}
}
return 0;
}
private int BaseCharToInt(char c){
switch(c){
case 'A': return 1;
case 'C': return 2;
case 'G': return 3;
case 'T': return 4;
default: return -1;
}
}
public Integer map(char[] ref, SAMRecord read) {
int readstart = read.getAlignmentStart();
int readstop = read.getAlignmentEnd();
if(readstart <= HLA_A_end && readstop >= HLA_A_start){
String s = formatter.FormatRead(read.getCigarString(),read.getReadString());
String name = read.getReadName();
out.printf("%s\t%s\t0\t0\tM",name,name);
for (int i = start; i <= end; i++){
if (i - readstart < s.length() && i - readstart >= 0){
c = s.charAt(i-readstart);
I = (Integer) indexer.get(c);
out.printf("\t%s %s",I,I);
}else{
out.print("\t0 0");
}
}
out.printf("\n");
}
HLAnamesAL.add(read.getReadName());
HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
HLAstartposAL.add(read.getAlignmentStart());
HLAstopposAL.add(read.getAlignmentEnd());
return 1;
}
private void PrintGenotypes(String alleleName1, String alleleName2, int startpos, int stoppos){
//prints genotypes for allele1 and allele2 at given interval
int i1 = GetAlleleIndex(alleleName1);
int i2 = GetAlleleIndex(alleleName2);
String s1, s2;
int start1, start2, stop1, stop2;
char c1, c2;
if (i1 > -1){
s1 = HLAreads[i1];
start1 = HLAstartpos[i1];
stop1 = HLAstoppos[i1];
}else{
s1 = "";
start1 = -1;
stop1 = -1;
}
if (i2 > -1){
s2 = HLAreads[i2];
start2 = HLAstartpos[i2];
stop2 = HLAstoppos[i2];
}else{
s2 = "";
start2 = -1;
stop2 = -1;
}
for (int pos = startpos; pos <= stoppos; pos++){
if (pos >= start1 && pos <= stop1){
c1 = s1.charAt(pos-start1);
if (c1 == 'D'){c1 = '0';}
}else{
c1 = '0';
}
if (pos >= start2 && pos <= stop2){
c2 = s2.charAt(pos-start2);
if (c2 == 'D'){c2 = '0';}
}else{
c2 = '0';
}
out.printf("\t%s %s",c1,c2);
}
}
private int GetAlleleIndex(String alleleName){
int i;
for (i = 0; i < HLAnames.length; i++){
if (HLAnames[i].equals(alleleName)){
return i;
}
}
return -1;
}
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
public void onTraversalDone(Integer value) {
out.print("\nSNP names:\n");
for (int pos = start; pos <= end; pos++){
SNPname = "M CHR6_POS" + String.valueOf(pos);
SNPnames[pos-start]=SNPname;
out.printf("%s\n",SNPname);
public void onTraversalDone(Integer numreads) {
HLAnames = HLAnamesAL.toArray(new String[numreads]);
HLAreads = HLAreadsAL.toArray(new String[numreads]);
HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
//Print individual info and genotypes
for (int i = 0; i < inputFileContents.length; i++){
String[] s = inputFileContents[i].split(" ");
//out.printf("%s\t%s\n",inputFileContents[i],s.length);
if (s.length > 10){
out.printf("%s\t%s\t%s\t%s\t%s\t%s",s[0],s[1],s[2],s[3],s[4],s[5]);
String HLA_A1 = "HLA_A" + s[6];
String HLA_A2 = "HLA_A" + s[7];
String HLA_B1 = "HLA_B" + s[8];
String HLA_B2 = "HLA_B" + s[9];
String HLA_C1 = "HLA_C" + s[10];
String HLA_C2 = "HLA_C" + s[11];
PrintGenotypes(HLA_A1,HLA_A2, HLA_A_start,HLA_A_end);
PrintGenotypes(HLA_C1,HLA_C2, HLA_C_start,HLA_C_end);
PrintGenotypes(HLA_B1,HLA_B2, HLA_B_start,HLA_B_end);
out.printf("\n");
}
}
//Prints SNP names for each site
PrintSNPS(HLA_A_start,HLA_A_end);
PrintSNPS(HLA_C_start,HLA_C_end);
PrintSNPS(HLA_B_start,HLA_B_end);
}
private void PrintSNPS(int startpos, int stoppos){
for (int pos = startpos; pos <= stoppos; pos++){
SNPname = "CHR6_POS" + String.valueOf(pos);
out.printf("6\t%s\t0\t%s\n",SNPname,pos);
}
}
}

View File

@ -177,7 +177,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != 'D'){
if (c1 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (c1 == c2){
nummatched[i]++;
@ -197,7 +197,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != c2){
if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
@ -241,7 +241,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
String readname = read.getReadName(), allelename = ""; double freq;
//For input bam files that contain HLA alleles, find and print allele frequency
freq = GetAlleleFrequency(readname);
out.printf("%s\t%.4f", readname,freq);
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
//Find the maximum frequency of the alleles most concordant with the read
double maxFreq = FindMaxAlleleFrequency(maxConcordance);

View File

@ -0,0 +1,43 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
/**
*
* @author shermanjia
*/
public class TextFileReader {
ArrayList<String> lines = new ArrayList<String>();
int numLines = 0;
public String[] GetLines(){
return lines.toArray(new String[lines.size()]);
}
public int GetNumLines(){
return numLines;
}
public void ReadFile(String filename){
try{
FileInputStream fstream = new FileInputStream(filename);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine;
//Read File Line By Line
while ((strLine = br.readLine()) != null) {
lines.add(strLine);
numLines++;
}
in.close();
}catch (Exception e){//Catch exception if any
System.err.println("TextFileReader Error: " + e.getMessage());
}
}
}