Modified CreatePedFileWalker to output PED file given HLA allele names
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1763 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
56bc4fa21a
commit
98076db6b4
|
|
@ -8,10 +8,6 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
|
||||
|
|
|
|||
|
|
@ -85,6 +85,9 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
similarityReader.ReadFile(filterFile);
|
||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
|
||||
for (int i = 0; i < ReadsToDiscard.size(); i++){
|
||||
out.printf("MISALIGNED %s\n", ReadsToDiscard.get(i).toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
return new Pair<Long,Long>(0l,0l);
|
||||
|
|
@ -130,12 +133,12 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
|
||||
//consider base in likelihood calculations if it looks good and has high mapping score
|
||||
G.add(base, qual, read, offset);
|
||||
if (DEBUG){
|
||||
//if (DEBUG){
|
||||
if (base == 'A'){numAs++;}
|
||||
else if (base == 'C'){numCs++;}
|
||||
else if (base == 'T'){numTs++;}
|
||||
else if (base == 'G'){numGs++;}
|
||||
}
|
||||
//}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -32,9 +33,14 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
|||
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
|
||||
public String debugAlleles = "";
|
||||
|
||||
@Argument(fullName = "phaseInterval", shortName = "phaseInterval", doc = "Use only these intervals in phase calculation", required = false)
|
||||
public String phaseIntervalFile = "";
|
||||
|
||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
GATKArgumentCollection args = this.getToolkit().getArguments();
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
|
|
@ -45,11 +51,13 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
|||
String[] HLAnames, HLAreads;
|
||||
ArrayList<String> ReadsToDiscard;
|
||||
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
|
||||
|
||||
int[][] numObservations, totalObservations;
|
||||
|
||||
|
||||
int[][] numObservations, totalObservations, intervals;
|
||||
int[] SNPnumInRead, SNPposInRead;
|
||||
CigarParser cigarparser = new CigarParser();
|
||||
Hashtable AlleleFrequencies;
|
||||
int numIntervals;
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
||||
|
|
@ -109,10 +117,37 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
|||
PolymorphicSites = FindPolymorphicSites(HLADictionaryReader.GetMinStartPos(),HLADictionaryReader.GetMaxStopPos());
|
||||
*/
|
||||
|
||||
//Determine intervals
|
||||
if (!phaseIntervalFile.equals("")){
|
||||
TextFileReader fileReader = new TextFileReader();
|
||||
fileReader.ReadFile(phaseIntervalFile);
|
||||
String[] lines = fileReader.GetLines();
|
||||
intervals = new int[lines.length][2];
|
||||
for (int i = 0; i < lines.length; i++) {
|
||||
String[] s = lines[i].split(":");
|
||||
String[] intervalPieces = s[0].split("-");
|
||||
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
|
||||
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
|
||||
}
|
||||
numIntervals = intervals.length;
|
||||
for (int i = 0; i < numIntervals; i++){
|
||||
out.printf("INFO Interval %s: %s-%s\n",i+1,intervals[i][0],intervals[i][1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private boolean IsWithinInterval(int pos){
|
||||
boolean isWithinInterval = false;
|
||||
for (int i = 0; i < numIntervals; i++){
|
||||
if (pos >= intervals[i][0] && pos <= intervals[i][1]){
|
||||
isWithinInterval = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return isWithinInterval;
|
||||
}
|
||||
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
if (!ReadsToDiscard.contains(read.getReadName())){
|
||||
|
|
@ -264,7 +299,7 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
|
|||
|
||||
//Find all SNPs in read
|
||||
for (i = 0; i < numPositions; i++){
|
||||
if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop){
|
||||
if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop && IsWithinInterval(PolymorphicSites[i])){
|
||||
SNPnumInRead[i] = SNPcount;
|
||||
SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
|
||||
SNPcount++;
|
||||
|
|
|
|||
|
|
@ -4,29 +4,41 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
import java.util.List;
|
||||
import java.lang.Math;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class CreatePedFileWalker extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName = "allelesFile", shortName = "allelesFile", doc = "Create ped file for HLA alleles named in this file", required = true)
|
||||
public String alleleNamesFile = "";
|
||||
|
||||
@Argument(fullName = "pedIntervals", shortName = "pedIntervals", doc = "Create genotypes in these intervals", required = false)
|
||||
public String pedIntervalsFile = "";
|
||||
|
||||
String[] HLAnames, HLAreads, inputFileContents;
|
||||
Integer[] HLAstartpos, HLAstoppos;
|
||||
ArrayList<String> HLAnamesAL, HLAreadsAL;
|
||||
ArrayList<Integer> HLAstartposAL, HLAstopposAL;
|
||||
int[][] intervals;
|
||||
int numIntervals;
|
||||
|
||||
ReadCigarFormatter formatter = new ReadCigarFormatter();
|
||||
char c;
|
||||
boolean DEBUG = false;
|
||||
boolean FilesLoaded = false;
|
||||
int HLA_A_start = 30018310;
|
||||
int HLA_A_end = 30021211;
|
||||
int HLA_B_start = 31430239;
|
||||
int HLA_B_end = 31432914;
|
||||
int HLA_C_start = 31344925;
|
||||
int HLA_C_end = 31347827;
|
||||
|
||||
String[] SNPnames;
|
||||
String SNPname;
|
||||
int start, end;
|
||||
|
|
@ -35,60 +47,155 @@ public class CreatePedFileWalker extends ReadWalker<Integer, Integer> {
|
|||
Hashtable indexer = new Hashtable();
|
||||
|
||||
public Integer reduceInit() {
|
||||
SNPnames = new String[HLA_A_end-HLA_A_start+1];
|
||||
start = HLA_A_start;
|
||||
end = HLA_A_end;
|
||||
if (!FilesLoaded){
|
||||
FilesLoaded = true;
|
||||
HLAnamesAL = new ArrayList<String>();
|
||||
HLAreadsAL = new ArrayList<String>();
|
||||
HLAstartposAL = new ArrayList<Integer>();
|
||||
HLAstopposAL = new ArrayList<Integer>();
|
||||
|
||||
indexer.put('A', (Integer) 1);
|
||||
indexer.put('C', (Integer) 2);
|
||||
indexer.put('G', (Integer) 3);
|
||||
indexer.put('T', (Integer) 4);
|
||||
indexer.put('D', (Integer) 0); // D for deletion
|
||||
out.print("Reads:\n");
|
||||
TextFileReader fileReader = new TextFileReader();
|
||||
fileReader.ReadFile(alleleNamesFile);
|
||||
inputFileContents = fileReader.GetLines();
|
||||
|
||||
|
||||
//Determine intervals
|
||||
if (!pedIntervalsFile.equals("")){
|
||||
fileReader = new TextFileReader();
|
||||
fileReader.ReadFile(pedIntervalsFile);
|
||||
String[] lines = fileReader.GetLines();
|
||||
intervals = new int[lines.length][2];
|
||||
for (int i = 0; i < lines.length; i++) {
|
||||
String[] s = lines[i].split(":");
|
||||
String[] intervalPieces = s[0].split("-");
|
||||
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
|
||||
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
|
||||
}
|
||||
numIntervals = intervals.length;
|
||||
for (int i = 0; i < numIntervals; i++){
|
||||
out.printf("INFO Interval %s: %s-%s\n",i+1,intervals[i][0],intervals[i][1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private int BaseCharToInt(char c){
|
||||
switch(c){
|
||||
case 'A': return 1;
|
||||
case 'C': return 2;
|
||||
case 'G': return 3;
|
||||
case 'T': return 4;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
|
||||
int readstart = read.getAlignmentStart();
|
||||
int readstop = read.getAlignmentEnd();
|
||||
|
||||
if(readstart <= HLA_A_end && readstop >= HLA_A_start){
|
||||
String s = formatter.FormatRead(read.getCigarString(),read.getReadString());
|
||||
String name = read.getReadName();
|
||||
|
||||
out.printf("%s\t%s\t0\t0\tM",name,name);
|
||||
|
||||
for (int i = start; i <= end; i++){
|
||||
|
||||
if (i - readstart < s.length() && i - readstart >= 0){
|
||||
c = s.charAt(i-readstart);
|
||||
I = (Integer) indexer.get(c);
|
||||
out.printf("\t%s %s",I,I);
|
||||
}else{
|
||||
out.print("\t0 0");
|
||||
}
|
||||
}
|
||||
out.printf("\n");
|
||||
}
|
||||
HLAnamesAL.add(read.getReadName());
|
||||
HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
|
||||
HLAstartposAL.add(read.getAlignmentStart());
|
||||
HLAstopposAL.add(read.getAlignmentEnd());
|
||||
return 1;
|
||||
}
|
||||
|
||||
private void PrintGenotypes(String alleleName1, String alleleName2, int startpos, int stoppos){
|
||||
//prints genotypes for allele1 and allele2 at given interval
|
||||
int i1 = GetAlleleIndex(alleleName1);
|
||||
int i2 = GetAlleleIndex(alleleName2);
|
||||
String s1, s2;
|
||||
int start1, start2, stop1, stop2;
|
||||
char c1, c2;
|
||||
|
||||
if (i1 > -1){
|
||||
s1 = HLAreads[i1];
|
||||
start1 = HLAstartpos[i1];
|
||||
stop1 = HLAstoppos[i1];
|
||||
}else{
|
||||
s1 = "";
|
||||
start1 = -1;
|
||||
stop1 = -1;
|
||||
}
|
||||
|
||||
if (i2 > -1){
|
||||
s2 = HLAreads[i2];
|
||||
start2 = HLAstartpos[i2];
|
||||
stop2 = HLAstoppos[i2];
|
||||
}else{
|
||||
s2 = "";
|
||||
start2 = -1;
|
||||
stop2 = -1;
|
||||
}
|
||||
|
||||
for (int pos = startpos; pos <= stoppos; pos++){
|
||||
if (pos >= start1 && pos <= stop1){
|
||||
c1 = s1.charAt(pos-start1);
|
||||
if (c1 == 'D'){c1 = '0';}
|
||||
}else{
|
||||
c1 = '0';
|
||||
}
|
||||
|
||||
if (pos >= start2 && pos <= stop2){
|
||||
c2 = s2.charAt(pos-start2);
|
||||
if (c2 == 'D'){c2 = '0';}
|
||||
}else{
|
||||
c2 = '0';
|
||||
}
|
||||
|
||||
out.printf("\t%s %s",c1,c2);
|
||||
}
|
||||
}
|
||||
|
||||
private int GetAlleleIndex(String alleleName){
|
||||
int i;
|
||||
for (i = 0; i < HLAnames.length; i++){
|
||||
if (HLAnames[i].equals(alleleName)){
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer value) {
|
||||
out.print("\nSNP names:\n");
|
||||
for (int pos = start; pos <= end; pos++){
|
||||
SNPname = "M CHR6_POS" + String.valueOf(pos);
|
||||
SNPnames[pos-start]=SNPname;
|
||||
out.printf("%s\n",SNPname);
|
||||
public void onTraversalDone(Integer numreads) {
|
||||
HLAnames = HLAnamesAL.toArray(new String[numreads]);
|
||||
HLAreads = HLAreadsAL.toArray(new String[numreads]);
|
||||
HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
|
||||
HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
|
||||
|
||||
//Print individual info and genotypes
|
||||
for (int i = 0; i < inputFileContents.length; i++){
|
||||
String[] s = inputFileContents[i].split(" ");
|
||||
//out.printf("%s\t%s\n",inputFileContents[i],s.length);
|
||||
if (s.length > 10){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\t%s",s[0],s[1],s[2],s[3],s[4],s[5]);
|
||||
String HLA_A1 = "HLA_A" + s[6];
|
||||
String HLA_A2 = "HLA_A" + s[7];
|
||||
String HLA_B1 = "HLA_B" + s[8];
|
||||
String HLA_B2 = "HLA_B" + s[9];
|
||||
String HLA_C1 = "HLA_C" + s[10];
|
||||
String HLA_C2 = "HLA_C" + s[11];
|
||||
PrintGenotypes(HLA_A1,HLA_A2, HLA_A_start,HLA_A_end);
|
||||
PrintGenotypes(HLA_C1,HLA_C2, HLA_C_start,HLA_C_end);
|
||||
PrintGenotypes(HLA_B1,HLA_B2, HLA_B_start,HLA_B_end);
|
||||
out.printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
//Prints SNP names for each site
|
||||
PrintSNPS(HLA_A_start,HLA_A_end);
|
||||
PrintSNPS(HLA_C_start,HLA_C_end);
|
||||
PrintSNPS(HLA_B_start,HLA_B_end);
|
||||
}
|
||||
|
||||
private void PrintSNPS(int startpos, int stoppos){
|
||||
for (int pos = startpos; pos <= stoppos; pos++){
|
||||
SNPname = "CHR6_POS" + String.valueOf(pos);
|
||||
out.printf("6\t%s\t0\t%s\n",SNPname,pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -177,7 +177,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != 'D'){
|
||||
if (c1 != 'D'){//allow for deletions (sequencing errors)
|
||||
numcompared[i]++;
|
||||
if (c1 == c2){
|
||||
nummatched[i]++;
|
||||
|
|
@ -197,7 +197,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != c2){
|
||||
if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
|
||||
numcompared[i]++;
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||
|
|
@ -241,7 +241,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
String readname = read.getReadName(), allelename = ""; double freq;
|
||||
//For input bam files that contain HLA alleles, find and print allele frequency
|
||||
freq = GetAlleleFrequency(readname);
|
||||
out.printf("%s\t%.4f", readname,freq);
|
||||
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
|
||||
|
||||
//Find the maximum frequency of the alleles most concordant with the read
|
||||
double maxFreq = FindMaxAlleleFrequency(maxConcordance);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,43 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
public class TextFileReader {
|
||||
ArrayList<String> lines = new ArrayList<String>();
|
||||
int numLines = 0;
|
||||
|
||||
public String[] GetLines(){
|
||||
return lines.toArray(new String[lines.size()]);
|
||||
}
|
||||
|
||||
public int GetNumLines(){
|
||||
return numLines;
|
||||
}
|
||||
|
||||
public void ReadFile(String filename){
|
||||
try{
|
||||
FileInputStream fstream = new FileInputStream(filename);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine;
|
||||
//Read File Line By Line
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
lines.add(strLine);
|
||||
numLines++;
|
||||
}
|
||||
in.close();
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("TextFileReader Error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue