From 98076db6b40b37405d0d4be9defebf8778c61fd2 Mon Sep 17 00:00:00 2001 From: sjia Date: Mon, 5 Oct 2009 03:06:42 +0000 Subject: [PATCH] Modified CreatePedFileWalker to output PED file given HLA allele names git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1763 348d0f76-0448-11de-a6fe-93d51630548a --- .../CalculateAlleleLikelihoodsWalker.java | 4 - .../CalculateBaseLikelihoodsWalker.java | 7 +- .../CalculatePhaseLikelihoodsWalker.java | 41 +++- .../HLAcaller/CreatePedFileWalker.java | 199 ++++++++++++++---- .../HLAcaller/FindClosestAlleleWalker.java | 6 +- .../walkers/HLAcaller/TextFileReader.java | 43 ++++ 6 files changed, 242 insertions(+), 58 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/TextFileReader.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java index 6fb482efc..a6acfb238 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java @@ -8,10 +8,6 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.cmdLine.Argument; -import java.io.FileInputStream; -import java.io.BufferedReader; -import java.io.DataInputStream; -import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Hashtable; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java index 888f3bc1f..bb14a64fe 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java @@ -85,6 +85,9 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker(0l,0l); @@ -130,12 +133,12 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker ReadsToDiscard; Integer[] HLAstartpos, HLAstoppos, PolymorphicSites; - - int[][] numObservations, totalObservations; + + + int[][] numObservations, totalObservations, intervals; int[] SNPnumInRead, SNPposInRead; CigarParser cigarparser = new CigarParser(); Hashtable AlleleFrequencies; + int numIntervals; public Integer reduceInit() { @@ -109,10 +117,37 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker= intervals[i][0] && pos <= intervals[i][1]){ + isWithinInterval = true; + break; + } + } + return isWithinInterval; + } public Integer map(char[] ref, SAMRecord read) { if (!ReadsToDiscard.contains(read.getReadName())){ @@ -264,7 +299,7 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker combinedstart && PolymorphicSites[i] < combinedstop){ + if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop && IsWithinInterval(PolymorphicSites[i])){ SNPnumInRead[i] = SNPcount; SNPposInRead[i] = PolymorphicSites[i]-combinedstart; SNPcount++; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java index 41bc1f173..d179e67b1 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java @@ -4,29 +4,41 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; -import java.io.FileInputStream; -import java.io.BufferedReader; -import java.io.DataInputStream; -import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Hashtable; -import java.util.List; -import java.lang.Math; /** * * @author shermanjia */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class CreatePedFileWalker extends ReadWalker { + @Argument(fullName = "allelesFile", shortName = "allelesFile", doc = "Create ped file for HLA alleles named in this file", required = true) + public String alleleNamesFile = ""; + + @Argument(fullName = "pedIntervals", shortName = "pedIntervals", doc = "Create genotypes in these intervals", required = false) + public String pedIntervalsFile = ""; + + String[] HLAnames, HLAreads, inputFileContents; + Integer[] HLAstartpos, HLAstoppos; + ArrayList HLAnamesAL, HLAreadsAL; + ArrayList HLAstartposAL, HLAstopposAL; + int[][] intervals; + int numIntervals; + ReadCigarFormatter formatter = new ReadCigarFormatter(); char c; boolean DEBUG = false; + boolean FilesLoaded = false; int HLA_A_start = 30018310; int HLA_A_end = 30021211; + int HLA_B_start = 31430239; + int HLA_B_end = 31432914; + int HLA_C_start = 31344925; + int HLA_C_end = 31347827; + String[] SNPnames; String SNPname; int start, end; @@ -35,60 +47,155 @@ public class CreatePedFileWalker extends ReadWalker { Hashtable indexer = new Hashtable(); public Integer reduceInit() { - SNPnames = new String[HLA_A_end-HLA_A_start+1]; - start = HLA_A_start; - end = HLA_A_end; + if (!FilesLoaded){ + FilesLoaded = true; + HLAnamesAL = new ArrayList(); + HLAreadsAL = new ArrayList(); + HLAstartposAL = new ArrayList(); + HLAstopposAL = new ArrayList(); - indexer.put('A', (Integer) 1); - indexer.put('C', (Integer) 2); - indexer.put('G', (Integer) 3); - indexer.put('T', (Integer) 4); - indexer.put('D', (Integer) 0); // D for deletion - out.print("Reads:\n"); + TextFileReader fileReader = new TextFileReader(); + fileReader.ReadFile(alleleNamesFile); + inputFileContents = fileReader.GetLines(); + + + //Determine intervals + if (!pedIntervalsFile.equals("")){ + fileReader = new TextFileReader(); + fileReader.ReadFile(pedIntervalsFile); + String[] lines = fileReader.GetLines(); + intervals = new int[lines.length][2]; + for (int i = 0; i < lines.length; i++) { + String[] s = lines[i].split(":"); + String[] intervalPieces = s[0].split("-"); + intervals[i][0] = Integer.valueOf(intervalPieces[0]); + intervals[i][1] = Integer.valueOf(intervalPieces[1]); + } + numIntervals = intervals.length; + for (int i = 0; i < numIntervals; i++){ + out.printf("INFO Interval %s: %s-%s\n",i+1,intervals[i][0],intervals[i][1]); + } + } + } return 0; } + private int BaseCharToInt(char c){ + switch(c){ + case 'A': return 1; + case 'C': return 2; + case 'G': return 3; + case 'T': return 4; + default: return -1; + } + } public Integer map(char[] ref, SAMRecord read) { - - int readstart = read.getAlignmentStart(); - int readstop = read.getAlignmentEnd(); - - if(readstart <= HLA_A_end && readstop >= HLA_A_start){ - String s = formatter.FormatRead(read.getCigarString(),read.getReadString()); - String name = read.getReadName(); - - out.printf("%s\t%s\t0\t0\tM",name,name); - - for (int i = start; i <= end; i++){ - - if (i - readstart < s.length() && i - readstart >= 0){ - c = s.charAt(i-readstart); - I = (Integer) indexer.get(c); - out.printf("\t%s %s",I,I); - }else{ - out.print("\t0 0"); - } - } - out.printf("\n"); - } + HLAnamesAL.add(read.getReadName()); + HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString())); + HLAstartposAL.add(read.getAlignmentStart()); + HLAstopposAL.add(read.getAlignmentEnd()); return 1; } + private void PrintGenotypes(String alleleName1, String alleleName2, int startpos, int stoppos){ + //prints genotypes for allele1 and allele2 at given interval + int i1 = GetAlleleIndex(alleleName1); + int i2 = GetAlleleIndex(alleleName2); + String s1, s2; + int start1, start2, stop1, stop2; + char c1, c2; + if (i1 > -1){ + s1 = HLAreads[i1]; + start1 = HLAstartpos[i1]; + stop1 = HLAstoppos[i1]; + }else{ + s1 = ""; + start1 = -1; + stop1 = -1; + } + + if (i2 > -1){ + s2 = HLAreads[i2]; + start2 = HLAstartpos[i2]; + stop2 = HLAstoppos[i2]; + }else{ + s2 = ""; + start2 = -1; + stop2 = -1; + } + + for (int pos = startpos; pos <= stoppos; pos++){ + if (pos >= start1 && pos <= stop1){ + c1 = s1.charAt(pos-start1); + if (c1 == 'D'){c1 = '0';} + }else{ + c1 = '0'; + } + + if (pos >= start2 && pos <= stop2){ + c2 = s2.charAt(pos-start2); + if (c2 == 'D'){c2 = '0';} + }else{ + c2 = '0'; + } + + out.printf("\t%s %s",c1,c2); + } + } + + private int GetAlleleIndex(String alleleName){ + int i; + for (i = 0; i < HLAnames.length; i++){ + if (HLAnames[i].equals(alleleName)){ + return i; + } + } + return -1; + + } public Integer reduce(Integer value, Integer sum) { - return value + sum; } - public void onTraversalDone(Integer value) { - out.print("\nSNP names:\n"); - for (int pos = start; pos <= end; pos++){ - SNPname = "M CHR6_POS" + String.valueOf(pos); - SNPnames[pos-start]=SNPname; - out.printf("%s\n",SNPname); + public void onTraversalDone(Integer numreads) { + HLAnames = HLAnamesAL.toArray(new String[numreads]); + HLAreads = HLAreadsAL.toArray(new String[numreads]); + HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]); + HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]); + + //Print individual info and genotypes + for (int i = 0; i < inputFileContents.length; i++){ + String[] s = inputFileContents[i].split(" "); + //out.printf("%s\t%s\n",inputFileContents[i],s.length); + if (s.length > 10){ + out.printf("%s\t%s\t%s\t%s\t%s\t%s",s[0],s[1],s[2],s[3],s[4],s[5]); + String HLA_A1 = "HLA_A" + s[6]; + String HLA_A2 = "HLA_A" + s[7]; + String HLA_B1 = "HLA_B" + s[8]; + String HLA_B2 = "HLA_B" + s[9]; + String HLA_C1 = "HLA_C" + s[10]; + String HLA_C2 = "HLA_C" + s[11]; + PrintGenotypes(HLA_A1,HLA_A2, HLA_A_start,HLA_A_end); + PrintGenotypes(HLA_C1,HLA_C2, HLA_C_start,HLA_C_end); + PrintGenotypes(HLA_B1,HLA_B2, HLA_B_start,HLA_B_end); + out.printf("\n"); + } + } + + //Prints SNP names for each site + PrintSNPS(HLA_A_start,HLA_A_end); + PrintSNPS(HLA_C_start,HLA_C_end); + PrintSNPS(HLA_B_start,HLA_B_end); + } + + private void PrintSNPS(int startpos, int stoppos){ + for (int pos = startpos; pos <= stoppos; pos++){ + SNPname = "CHR6_POS" + String.valueOf(pos); + out.printf("6\t%s\t0\t%s\n",SNPname,pos); } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java index 17d644be4..76e8ec77b 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java @@ -177,7 +177,7 @@ public class FindClosestAlleleWalker extends ReadWalker { if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ c1 = s1.charAt(pos-readstart); c2 = s2.charAt(pos-allelestart); - if (c1 != 'D'){ + if (c1 != 'D'){//allow for deletions (sequencing errors) numcompared[i]++; if (c1 == c2){ nummatched[i]++; @@ -197,7 +197,7 @@ public class FindClosestAlleleWalker extends ReadWalker { if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ c1 = s1.charAt(pos-readstart); c2 = s2.charAt(pos-allelestart); - if (c1 != c2){ + if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors) numcompared[i]++; if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); @@ -241,7 +241,7 @@ public class FindClosestAlleleWalker extends ReadWalker { String readname = read.getReadName(), allelename = ""; double freq; //For input bam files that contain HLA alleles, find and print allele frequency freq = GetAlleleFrequency(readname); - out.printf("%s\t%.4f", readname,freq); + out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd()); //Find the maximum frequency of the alleles most concordant with the read double maxFreq = FindMaxAlleleFrequency(maxConcordance); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/TextFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/TextFileReader.java new file mode 100644 index 000000000..adc52546c --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/TextFileReader.java @@ -0,0 +1,43 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import java.io.*; +import java.util.ArrayList; +/** + * + * @author shermanjia + */ +public class TextFileReader { + ArrayList lines = new ArrayList(); + int numLines = 0; + + public String[] GetLines(){ + return lines.toArray(new String[lines.size()]); + } + + public int GetNumLines(){ + return numLines; + } + + public void ReadFile(String filename){ + try{ + FileInputStream fstream = new FileInputStream(filename); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; + //Read File Line By Line + while ((strLine = br.readLine()) != null) { + lines.add(strLine); + numLines++; + } + in.close(); + }catch (Exception e){//Catch exception if any + System.err.println("TextFileReader Error: " + e.getMessage()); + } + } +} +