From 6e9a81aabe8d8aa67deb114cc222b5d4b45943b7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 21 Jun 2012 16:56:56 -0400 Subject: [PATCH 01/32] Minor bugfix -- now that the testfile is in our testdata regenerate the idx file as needed to pass tests --- .../sting/gatk/refdata/utils/TestRMDTrackBuilder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java index 9d14cd74c..4e6fe5939 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java @@ -58,8 +58,8 @@ public class TestRMDTrackBuilder extends RMDTrackBuilder { Index index; try { // Create a feature reader that creates checkable tribble iterators. + index = loadIndex(inputFile, codec); featureReader = new TestFeatureReader(inputFile.getAbsolutePath(), codec); - index = loadFromDisk(inputFile, Tribble.indexFile(inputFile)); } catch (IOException e) { throw new RuntimeException(e); } From 7dbba465ee22d21ff9a2f1fca46963a3092b7d00 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Jun 2012 11:32:18 -0400 Subject: [PATCH 02/32] Bugfix for shadow BCFs to not attempt to write to /dev/null.bcf --- .../gatk/io/storage/VariantContextWriterStorage.java | 6 ++++-- .../sting/utils/codecs/bcf2/BCF2Utils.java | 11 ++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index d0fdae639..fb05a6b04 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ 
-107,8 +107,10 @@ public class VariantContextWriterStorage implements Storage foo.bcf * foo.xxx => foo.xxx.bcf * + * If the resulting BCF file cannot be written, return null. Happens + * when vcfFile = /dev/null for example + * * @param vcfFile - * @return + * @return the BCF */ @Requires("vcfFile != null") @Ensures("result != null") @@ -209,8 +212,10 @@ public final class BCF2Utils { final String path = vcfFile.getAbsolutePath(); if ( path.contains(".vcf") ) return new File(path.replace(".vcf", ".bcf")); - else - return new File( path + ".bcf" ); + else { + final File bcf = new File( path + ".bcf" ); + return bcf.canWrite() ? bcf : null; + } } @Ensures("BCF2Type.INTEGERS.contains(result)") From 61f0c46423fd62e7824218aee9d2122543f1a224 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Jun 2012 13:03:53 -0400 Subject: [PATCH 03/32] Rev tribble to 110. Log is: Optimization for PositionalBufferedStream with specialized read(byte, int, int) method -- For binary codecs having an efficient reader of lots of bytes that doesn't fall back into read() itself vastly improves performance. The old version was 10x slower than InputStream, while the new version is +30%. 
-- Generalize PositionalBufferedStream main() method for performance testing, now accepts cmdline arguments for the file to read, how many iterations, etc Generalize AsciiLineReader main() method for performance testing -- Now accepts cmdline arguments for the file to read, how many iterations, etc AsciiLineReaderTest and PositionBufferedStreamTest were in src not test/src --- settings/repository/org.broad/tribble-107.xml | 3 --- .../{tribble-107.jar => tribble-110.jar} | Bin 315157 -> 313966 bytes settings/repository/org.broad/tribble-110.xml | 3 +++ 3 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 settings/repository/org.broad/tribble-107.xml rename settings/repository/org.broad/{tribble-107.jar => tribble-110.jar} (91%) create mode 100644 settings/repository/org.broad/tribble-110.xml diff --git a/settings/repository/org.broad/tribble-107.xml b/settings/repository/org.broad/tribble-107.xml deleted file mode 100644 index 0d3a50baa..000000000 --- a/settings/repository/org.broad/tribble-107.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/org.broad/tribble-107.jar b/settings/repository/org.broad/tribble-110.jar similarity index 91% rename from settings/repository/org.broad/tribble-107.jar rename to settings/repository/org.broad/tribble-110.jar index 7157387eee4239b0958245b9d39eb6d0138aa13d..f8e312ad94d7c96eba2e9093a65a2470204911fb 100644 GIT binary patch delta 9716 zcmZ`<2|!g<`v2~|=e_%QT;KsdWEBw+1x0ZIHBr=@&|C;GF$GjKP4j7LO_rKH z^W-vaxG%Y&JPLPBbIqp7shKfza+;cH(b7!E#{ch}d)b`-Q1{;Np6@%~e!laa`+R$C z=nu<70|xffn3KpsG-&*lWdY5#fp~OWB_E&0%E#RV@$kkft;DnZDdlyW0Lre~<+J=t z|IY#>rYd5%Um!4F)r{9RR2_HS0E1PIDqKe8Ps ziRdKHEmI@B1DWoPY5s|1lIklv>jM*?$z&ii_?|Lm_r9U{m8Z6-G#P!DypTzbrET1r z1m0*DssszxjWVYl`7Mx0T%EnC!@~He%`Dj4FU!R}KJZ!6%&+{H_Wuc#v+nk65h8*3 zVRubFF(WpZB91@P(**P}_e_+Q_iuAV8TXbsvZ8R23B*t8Y6AHsnh8vpI$oEYKYcOC zE`fi{y&ool8+@H9^pzbShl&yJh$9`PTRzGA>7@jnD1l#}#Ui|K-VEkPb~1bUM@L#q 
zW?toVs$|jObdec^E$7REju1)%-z~woNKO(kaqeKyD2Y7Lwy?tDX&Wde64eoZ%rsFSJ^IA;AaYzbSX}mpp;7Q z`DRQ~@+HKtl_+Tv89rI*?UeQm&Tn}y3mKB7@^xiuIKTOd67E|&ODT{J@Y&}m<e67RP zdy!J9_fdb zSz|eO+f4OSQ~9|bHOX}M?q%wEnFGFAzgIt(SU$W=?JOrOFH<*~Q#+kfM@PxARo7Ho z;D1M#e;lMW;+MWx{Uq+@!U{*HU?Op~7PuJiOIbnQa3jcb#!1cT1)XL%Nu`-iilP_& zFfvQrX6y8lgXTC%CyzkS6}Ndhz3e1|=8KmFPFhH>IOtWu<#%G_HJuhYDO5~&o!$_) z#ZFp6OU3O?KTt1o&|5lrowS_Z)~Wvrow$=4(K}9xp_PK&Dgmu_&>F$>_u>US*E*>g zy`$4Qe`+m|>m9VgNgJumNtX72!g{g(RJV+uJ7OADstmKGKlfy7-{ls9o&enF3E{)RBDI!_SB)~Jq_>ab>2II3W1OX53#`0544C6?%nX#9y}C0?MaG{ zlzo6y%EVpdFi~7H2u!5z01cCjdQeZi2^NnF7W~p(wNZym44E^MK$t8igh>to)&!ye z=OrWs;t2r=AS;hqBZQt{rl4vWwl7NODn z4p^dT6iglKs2pKU?*qI>N-)sKE-2TH-}qJCq-65f?y3De!p5#-r(z1DEI7&rRJ zjgg^39XunG4v>v!-&do0Wa8Ddqy*w8gM4ZoOCle&q?WHD7K`-A-ur5TrWjc=!Su)Nvv%pxhp6vBgqD+VFiVE-A@rA zk?b%<1;IG2bzoVy-8e?e@71A4s(qedb2`H9ggTvE9oq)qCWtAmDbST!1NnhD}%C;Tr!iMrW`Dip-AEpG!pspG(8Cocsr*wmG>2N>?6I))0ZF zd@8`xpT3}R^c<4NO@Bnfi-eA)bFgs&ydzS32I&u>U4Tx{>oidjxhwwwJftRMRF5;r z$Dg9pBqG;CYWzQ@#Mh3q>KR2kRv?8u#5TvO`GHq;rk9|+p+t*J$LJj7Dw<775y_aW zo5`4+!#uH-)G|BA)}oSHW#`z#Y~od9DCjCErMAkMYi$Ec+(wk2G8v8}xy)0r@=C!P z%%vZNu_@AfqRym9kJ_NdG`y)az4kOBI(T0!)7mMr_bEKHYG52D$0@RP<;Q+gLVSyC zTB_OWU-&xncvS-;adi|ONe6!PD6{hig_^@d|IkRkj94d#zM+HY8xm-lxV}(=p8zU#vy^$3241e8+6*}5IsXEZPG>C5Jp>^w3W7r z+jb}Ipq;pRXqO)t?RL<+*qohWbH@JbL^#VGbU^SsC~g%x`NXr*LGOtt_`k1Hm6MXF z(n&4okkIC^fQ|_CQ30WCIp)NoIN_v|bV{)PK&R6>ok2A%6#YjQc6~l-5exM#*0mr- zRq}YzK`o}ALVEL&J^J)%(FeO}QP1(YQ>G3n?pa(iQz2WC=u#B&n^G_-x3H+N=sAo% zJF~O^{A^Q8bElMI$DLSEgg3Bg(TDpS+Dc^uALOUS@NT!%#!R$F{Ms!wW8@?4P>9tI z^(iWuR+?Qpr66~b=!Fyt$uAZ{MQhRr+Dt2YeyUC-aH+J$r_+aMF+eUUdd$o!EGqEk z7Dw~&)mj7o_&6daC)jF|=do5|qc`P=#F8Y^nvpjyxez zr_Q1UOXfv>TC40#deT5C%QMggbhLV~4C*2qF#}sfHv<(S%RrIH(&?;$^?hEa3kF@J zO9pnufd(qTcmWMJP=Im_)D`b7oa72e$8)z+YlZ##qeq%DA@8zI9~-D{pBShe;QXn0 zxnj_t1dGq;^Lm}xpAGs8{Z+JC(NURnRU!X6jx|tTUovPmy(IfddnR>PNcLf7E0_N~ ziw@7EFATayeGU4Ot{bAmuHU2mO^E+WryJt=cZ0sBJf4YGj<<`^qP%J<_5P>bPjtL`Nd(oD1~u1MWR|OMfk*Lr)Bv{Z>2*U{kbW zgf*<s7F6jABpN2CLab 
zS!~2uZ)l~WDd~d_O6YRSUkmdHhIYJ)22L}$xT?$Xj&Oq$JYTT8>l7%qK1 z1tj^%xrXdpseAQcz8f2gf0`h$C3!%V9PMUf-S$c{q#16zu*1#pOk-}F+n%3h+xl>) zG}g@)3Ul48*2=!<8A*1R1L4R)*wF{00~>}CW%m`|UMcW!%P&e%VgRm~2 z!4{rJo_y@j1=wlFqt7dpuIUZyx}ggn1b^7UW&#*?L9pFe+u49b@MqTqcs4|dHdc%c z1y)38HX6b%2!imXKiTY#-^ivY5gCs-RP>C)L}W9-UCi@KO4gE(R8WAhPL(k&M>L}LMIzyzZC6m%jQfy8k5!R2Ym?o)tT z$5X+`*gQ-Xo+`1@r6Fu!6%@Rcg0jSMBcuU3eE0Bu(v3PQTu8k_um^$KkI}uz$$cnd zGoAT=}(NZ}O}VIGhoD?Aj{HOY>%Qe*{1rLp}K9TLM1 z<1}O)mSS;OLZ!iBX*AAC&8w)fINZc_w!7I%3J7TufnpNBm7FHjG{O!qRZ#*?K_;P( znpM*#*3e2w(l+-ju;FC0(LC9RRWty_lkknipQ=y?4xzI@jDmj@@jr(B;y4QW2`t8w zSbV1trqieyXHc)tVR4+tg1CSby@aF8N64aMw1GaBJ{pWYW(YDx>@#*yMxhGz0+dO) z9xAd5%|@^(foOl6g3Qj{jRVtES!l%ZsuV4>232OzbU;i-Em0gaGRdyXFLDb0h#pWA zoo3qaiwuj^=|%fJWJ2{KnFSafgBc;ZPPzy3h+AQ*TZa!!gQQJ{gT3L;eKIyCA9`+n_4F(K@6e6anYw(wO5d8f5Iob z&#<8Wj3dEc;E2Csm%fSw^n4*bBq}k~z(GYMq=3vl*@ca4E{cu{Hf>RO#7b(2H923d zq!bEAA?k*?=~%b;u!#IKE3x_?YN*qzj0M%09s-@vF{L~Ml%@Awm!t+xK(3eG^;V1Sw<%#*&a3eIzsa|F#if`{08*#lI|EjgwgYZ=wooN}(5OZOw3O1ckVY`84Aj}urj~%S4jZXbiziVqdBI0i;nPfIn0(4G1>wiZjx})+9XVIG;OV-@J=w!`leyneiqHQ*zk5je1<`?n>=~{40!H~a` zr?r)wFXm}COyq$=ZAqLwbS^yn+49#>xnH+=Hx%OMg*==#GEOnKZ`teG+a_^~W!ic3 zi~Ze|T9~EeT5Y!ZiGS%XZG`zF$9Lu0a`Ok2`A0OD`AvSsSuM}x^5Bwo$u#TYW$jvX zsnDK#+8mRvjg9>rB*&V$*+(u3{hGjBrVRh~Y_2)B{oNvn0z*^q156}vsg=0 zj(q{UXnOg|t1QIS=U<$SF?sf1$vVjiudQVL?HYb*nnjNWftmG z85=4`qBpUD5-8rp21(%LCRQMU#35T*u0&1W%En9J z`c_NBzT4OoIdXIxE0jR;c9t!HdE41!3EbJvCQ2ZGhedOF2OBL%Qg^c65}3WyGVqg~ zth*d(xXUtd+OBGocUe|M?Pen+&5Yd^&6VAjIVta2KAe33AM(rIWz8kwrFYpd2_){Z zJo@q;_JkZcx5t{%bT1nvM;7e0M!wi<$$4y_rSF=3>`6)U^*+l1Pwi*V%8>*6SuY9v zw%<}}WI4;1BPYsPjszMVuu#yhr-?dXxwhFsOGD2=Hb9~-AGFMETEU)?Bl9aP)E_G> zb6fhX5H0svK~pN(7)hL0Y0*?wvMf2`e9toHx%VvPFTcm~Br5HF%Rv4Ot##0un*BP>eIe#rOvSOfXP z()un=-unm(X^d5nS^Eu)u+i84ZJM`;_^qR?8P7Rtk=FYjR!d2n=KZV)()Qx>nEZ{9 zcRdD~tf3f<=6ho`8;^-X6&-Md8B*%fyOXlN0}FArs>doZs%BCE|Mr;0?>~ay=;#`h z!RIvA0(taiCVz|NV~+#>?`C!IQOTOY$5&bSV*;Ohd_-Km z#qz>;WhX~>i6atMdOa_kFR)nUX|a69X)6G=TL)oRul?IJ@0nL?=;dw_C$lQ|6hvn( 
zu0c8Z8jMKP`vMiXLZVjuUJK&)Pq73Unv^R$&UX&Pziz;lSStfjPFz(>{_F>qP7yDE z^F}lpDRITuBj2*k6x`;5V&l6%VBwOz%~3Gn33O`WYG03D+gn2)Ep3*HCY^?&tKK(J zu2?mu7PVNQc7ISa%f>%Cr@8rOaF?9=y}*Z_uaEcREzel^?q`6%alynRXbt&mXROF% zoIQE14SF?kwX3J)={qLWttTdOtqKjen0a{sT*iHgn>hJ{>?LuGaOG2x?+Z`eL%?~m+_3JOw!CD%*8L9v$$LrTxJZaK^gqbK~SK@c?;k5Jn$vMYVb}z z2Pi43$-}0P;X|VMn23*KwSSxDP02Co-yF^Scuii`QizXZ)qgJfzI%_6CjHp4Xw?Vy zQ`80;auNT6MKlg=EwG;vT$6OXtn{%rKe)?FG!{rMsRw~!bKY8&tSODT`Q7sG4 zP4j+up+=a_TVG_d#o>J}f=kTYI;dAJvT&KS8wBe6`E^iVV3w3})A_{w5STBnw)Lc_ z%WLQvU$=|}^PZP1zty3`aj5!F_#yvoe*6+^-0~3@WWMvt^&7X*po+`<&}q)vG;h6} S(JSy}92F*{5dE;40rh{F0d;-= delta 10524 zcmb_ic~})!vajww_cmM_xF902K)@w2PeS(SWJYt< zGcoRxV2p_xqnBtjW;5F)FM0VgZ)RRHCf{V6H`&L;XcphEPB(%k^L_KjyV%{;b?Vfq zQ&p#`&S}5c)NS*QZlUFq4B;UPB8s1IaARnu5r;?NR`qx%Sv}6D^TVI4_2uW0GHtz! zfxYj!)$_>vA%6;0Ehpk<28W@gsD8|H?{_-Uce?-ZYjOS^y)1ctofdwiBJI4cC`)_q z)k2x_z?WkG^F;K#tlF!``^$yt@1FU#V$v_L_mu#c1Z*bd!t&n*9{u{X+dMwYzEdb> zP5IG=0)A7_O^aZzJ&IPG`YDWvuc7{aGh+OSV*^$#{iTX4UP*;2F42GW*|w1N5ch7BxkTx2TF@7x^j zR=_8>UW`$|hjOPq=w18Y>c$xVtW$-mW%H>T(|{TL+L?X|G4M>St>TAY+Z&-UCGS0> zGs%&#_oG4-kp7SRLKSfIdw&qq1SV%E#=-VmmFRO8|>t;~j=j?>-_ zSL%ZPFaHw!T4uD|@R$ger!uu*)#uerZI+TfkTFVIrGyXco}#T+GYsTaYJaz(#47C( zC4zjZMjNcUYKyfdrIEnuW!iiN$+Pv^Ak`SRTpRCEww^ja=Ryf=U82jKd-XVZ=%C1u zrbvhI z{T+LVTbuPPTdR+4)L#oxt!uXHe|D+CX71I8G7#9bSKpJOpdl~lHzq1*>sj3u7QusE zwjco8h^K$jg5?{Z>-L)b`|HPzmT;n$aJgZMi1W`)i1o)=;mwCUluC6b)q6;%XS)Qwh| zbcacIdMK9e@=y%j&BXUGbZ-!?;~wih1Mr3}zwuBq-DT2!A(X|fzYPZI?}BK9NgF+M zKlwehi8gy^3rUlHANcYM{aRgnKt9`|$IJ4s^-}rV*LtDc{jLQ9s3-{=XE zCe4LBN{)JTG-<1>|3+V86w+v^UC;(c7Sbq-3TcQ%18AT{gJ`h-@RAHU`JZ}4Xdyjl zQ8A6NXd7+!Z(Q9il1X=1w3D`*w9BI1v`4=EPkrU^adeGEepyxhoPykf+<66qa~o^Wl)JKef&4khNR%&})1#9LX)lk^ zYSBYH>NqOXNH3mhQ4KAY+PC_6OCOE5hpA1Y;pwt8*cc|?uGXTheHJ}J`%QWjPydOx zl40E-|E}Sd|HG10*?l|~bI5o4o$})!wQlmo^Lm#2=e>Fl`L`o_M3A|%YUv7J8N@iR zkC7GU_3N*LVN&4-8cgA$1j?oyz4*3JU08Ix%%YnToLRu+ipAy&dR8H`2qOdc zfmjBEC;}sNiX88BAB%Q_vEuslDQAz2)ERCNgVL+IlHc|=bbmAc*$^4OY?MF(~aEsQWLbIt9cmc}EYV0x| 
z8<^=V4xR~FPm)V6_(4xB?wmg0ic!sPP;i(!d+AcMJgN78}V0VmI7vo-6HzX)BDMrKv#kkw4I}`Uf zK{_}kGpCdQ8AGiTlv){Y1xOnj+QBsRDe9?F7q(yhf@@kQSL+x>-r>>OE?sw+n%UB< zgBuao2w}=YEzrSj)QfJ1G4Ftf-bn>?7j$wD9O+(|Zyn8~^>jVmN6YB9w3>cLx6lSK z+KATs@wbT{pv|-g(d{vk^aS{|1H(?U0Ni__PX$e*5cD2L)0^QJ7MKjB8DJZVAve%< zR0$1dV}MzJ1cqHfvjMp=_)-jf0}8xQdK4n(fFblxb7(G}9y&->G!M3A?*jcLkcF63 zZPI)o)DK>CiMo@=#tMP?V~LvNBl2FtTDjaH491$Yz@&w6)k~C4`ejxf-n0k-1JR7V z32d6+4Yx>Q5ye=qN9U@=faJfh+VkslbC3Js^k%@^g>}3%t;S6*H7? z!%P}((g^=(^BTG2yb;B0o0;t}ja-esrs?$>dB&}-_SHAl)YUecG*YAZsVi!mYL@wC z)HK%2Tk0EKTU*yurJ6NLFR5!-n8#_Qc41ysXG>nOZ$VvyuhiGHsBS)Z2Q~Ss8phVG zs@161)m?HNZh4~@H2E4XcL`;#Q+-wQOKNI;jbO0=Ijr3Om670|ytTU=UaSpH4XLVb zT2Zysq0lC16y{r9@2hU|&7Ztt**sr^KfkbhGiE%>q#|fWqwZIa-Z8kN1W!fXiiT>R zoiF1?H&)lwFn5;2*T6KnoKh{sjc|<;SVTLYE?iaBnAfThC5hT9AhCKr`Kyty4FG%o?s#VPLdI|hN2Zbs??_Fi85}y2t($N zmUXFyNA`Wvu>41Ajg!nLpi-MzYI#kLQg^QQUE0lLgt@< z0;L=)mlZnRmdeqKfvnk`dO#EG4$0J<0#!yMo}^Mwu#Lm|O#^cQmIEB-3^N<@XMjX-^t4@@TM; z;dW)t#+d%9y8J|58oV%EkR5|Ty;$dZH-?{0Jr%@TFjw(+dH)6lOLla1Elw%WM)IHjK-_#j7zBeKh&e55^#@ z=`3{A;ZvN?Is1Td7TkLqx%2~Bl}}QTr|KvLaizv@<}5MyD47ND`U4;};EEkpYyqY+ z0P+li=>{uV>;RGB$imKrx-VE&>%8E@DI$oq%EQaQg&P?on3)}6hIe5Q(}jVX46jjS z`>4Z%jip#z>ow9AhS-9$xRCZrITOI#US$&?=S0}11R5!Y$)|J~F00FMg7n~ST?E$I zAJkxPd5zfl)=_e28PEsDaT@Q!zQ_XTZ-`27(JYy*4*18`ib zNG?td-n4^;!4RRcB*N%{Xuc%E=-bQ+>W_FpC>C_oM&VSA8O+D13t-)a(B%>oAWPx8 zwNym)h#SihT^f`G1He5IlmieL*ncVt&{fqH4hnjxfD}YixWt=kQUzC4(}Lkb z(-|{^p_v}8c}<$-f&8=icD)Bx?u`u1VQ4NxReYPruhlA08kE8{?|kxkz+*uMg9{m4 z#J3uzUd)WEOj=^nQjOePl}zxLZ%vF0?I?4K5%`eGwTY^vyJ?Gvk{pn)uIcmzHAw9x z9VH1fLREG}rMK+=v(md>m3pBSO;y!PN~`LpRiSYFmC~=H_`0(4L$TP{RPOUF!G^O2 za&ukhZfdAeMT=h3D2qmGL+0x9<|)O@`ZtO?W;mg7Y+WscP(>Tpb7Sfn8c-_Qg`%TJ zbKZ&t6*V`5tecD&Yed(Hs)mMMkYhGv5KDcv4AWjj-f~Se%ffIF%p$s6|D&p}B=k zi!?%EIuV6u_0qaV`PaopVly-&>XoVOnl+}gX3ZO|s?~|pYip3!f3CR1xnfI!{J z5!1&mi@6r#@IkCi+)+iTC}twCa3O~6k1CUUqs0zF-4K&I%Vf^F9E3Q(7%IunaRO#C z9|}Yos6tD!xC}+ObaPV4M)`S^g6$%j+1kfNi3k%(2$U>P4>g2cJ0z>bjxyPUilg&j 
zjPndSg&7^f^%+7K7@J&PR=@|Dfm^){o@m|mH6BJU(;|9no%|Xf=i4r_oB%;{v&adqh z7K~?jJ9$xjw_W;#k(DDKq%Z&x-O=23>C4NqHbcRDjF5#1W+T<)AO-h@u5zLBJV1Ss zZuyX{0Eb-z)g(AE#;PiM5QO3a!8R4A!NEv~T&WCEc}@qf0Z2)Mz?&xY@EHv7gn&l2A{YSDOaoE3qbfWGM;9H@hKib z5gA@Qf+HrU;NgixM{h`EbXiGuY0fFz&fWJ=T28dcZKs%_E>t|RS@Eu86nBi`@sgk{ z{$A(2d%yCGd}Hz*9m(5Oj%qHvQ+L}(YeHFYkl)8s1(p+H2eks{P5uiu?t zR`D6Se@1dc8NSCHni3_KBpN@duRlwYjIH)px}XfB*#1manq@p>sIITOK0ZDVbD!r@ z-<9fn7>R*`T%*DMnzXOLh{#c2j-H-l^jGY@o@0DyH~z_I{5Dm6IjXta_`t@M-e0nd~DqFL+r0&f6!e1_tyl;yysC#eSX{s z4y1f%^b1k6f1f`%az8#=?GKj`YcY6!2l0mIMYycX7r}vTns_u^4KXo7{HIqzNj-%( zOhI}1;#RNv$ocy%o0iYPzKySJ`N6kF&n?Aod0=DdQnm*krrYlU*E>W%k|1L2_0iWy=J_VHRc4*pmJKfLi7K>F= z)gA|UV~;~TXs@VJOAP9}qVv%)SE-Mt=;0 z@x$TJcpi7O(D=BRr!Yk?f$KjX7uo79^`J9_=7VCs!hC#CR4SnOkRv(d z9Ah^fa+Eyrup{f6VAlTk2uC^KjN4#{Yf!L5#RkJ zXjn^UpA;nulY7*e)B2;B0XLmHDykJQ>zFgk6URiYBF+go=6f&z8rFo?4pF(6uN}_A zz!UA5AOrDFIrCce6jZ_(D0-XwQFwL!T3d#uI34Pf4O5;S zhF$}9I+}RuX>k1StWE-tygeYI)oAf2(Uz3k*=EV~GtOw8x$A1fywI<+Ly$amQp71{ ze`aQ@26wi(uw`jKKhK)WPK!j{*ujW2dE&G) z=9|D8w;fP?z&p}a){+iKyE*Nt3iz0eGXsw#44HODL@DghGmM?i*mSveD(3Ob6x?1t2^&IPMa>U- zT_?epFWz@_{90Q|W^t!9d2j~k-#g=|_cQP_Cf^{~qvPbRe$dXLN)UJ+b_h6~=so60 z@5`dQy!j;udoN>WDr|3ga<+)Gy@(~~EM$1s*ZFI0nL3}j6smbldKnB}UBieJ>0K{e zGHQ*7xAK4R4=ZW_ diff --git a/settings/repository/org.broad/tribble-110.xml b/settings/repository/org.broad/tribble-110.xml new file mode 100644 index 000000000..84a550b27 --- /dev/null +++ b/settings/repository/org.broad/tribble-110.xml @@ -0,0 +1,3 @@ + + + From 7b96263f8bc558bed6dcf16ba76c2be5ca59ed81 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Jun 2012 13:05:59 -0400 Subject: [PATCH 04/32] Disable shadowBCF for VariantRecalibrationWalkers tests because it cannot handle symbolic alleles yet --- public/java/test/org/broadinstitute/sting/WalkerTest.java | 2 +- .../VariantRecalibrationWalkersIntegrationTest.java | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java 
b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 708dd042e..a997385d6 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -79,7 +79,7 @@ public class WalkerTest extends BaseTest { public void validateOutputBCFIfPossible(final String name, final File resultFile) { final File bcfFile = BCF2Utils.shadowBCF(resultFile); - if ( bcfFile.exists() ) { + if ( bcfFile != null && bcfFile.exists() ) { logger.warn("Checking shadow BCF output file " + bcfFile + " against VCF file " + resultFile); try { VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(resultFile, bcfFile); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 9bf01ad71..56bf8a567 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -54,6 +54,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); } @@ -101,6 +102,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); } From fb26c0f0546041c86ace5c012b2b1240e604c806 Mon Sep 17 00:00:00 2001 From: 
Mark DePristo Date: Fri, 22 Jun 2012 13:06:24 -0400 Subject: [PATCH 05/32] Update integration tests to reflect header changes --- .../sting/gatk/walkers/beagle/BeagleIntegrationTest.java | 2 +- .../sting/utils/codecs/vcf/VCFIntegrationTest.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 99710831d..234680ad7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s --no_cmdline_in_header --allowMissingVCFHeaders", 1, Arrays.asList("c5522304abf0633041c7772dd7dafcea")); + "-o %s --no_cmdline_in_header --allowMissingVCFHeaders", 1, Arrays.asList("cba514105039f7a56f7ecdd241fbdcca")); spec.disableShadowBCF(); executeTest("test BeagleOutputToVCF", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 422e890de..3a5f2efe8 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -12,7 +12,7 @@ public class VCFIntegrationTest extends WalkerTest { @Test(enabled = true) public void testReadingAndWritingWitHNoChanges() { - String md5ofInputVCF = "babf02baabcfa7f72a2c6f7da5fdc996"; + String md5ofInputVCF = "d991abe6c6a7a778a60a667717903be0"; String testVCF 
= privateTestDir + "vcf4.1.example.vcf"; String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; From 11dbfc92a70e36702d293f2d417150cd93009afe Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Jun 2012 17:03:59 -0400 Subject: [PATCH 07/32] Horrible bugfix to decodeLoc() in BCF2Codec -- Just completely wrong. -- BCF2 shadowBCF now checks that the shadow bcf can be written to avoid /dev/null.bcf problem -- Added samtools ex2.bcf file for decoding to our integrationtests --- .../sting/utils/codecs/bcf2/BCF2Codec.java | 10 +-------- .../sting/utils/codecs/bcf2/BCF2Utils.java | 22 ++++++++++++++----- .../utils/codecs/vcf/VCFIntegrationTest.java | 14 +++++++++--- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 91331ac13..94c24b097 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -101,15 +101,7 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD @Override public Feature decodeLoc( final PositionalBufferedStream inputStream ) { - recordNo++; - final VariantContextBuilder builder = new VariantContextBuilder(); - - final int sitesBlockSize = decoder.readBlockSize(inputStream); - final int genotypeBlockSize = decoder.readBlockSize(inputStream); // necessary because it's in the stream - decoder.readNextBlock(sitesBlockSize, inputStream); - decodeSiteLoc(builder); - - return builder.fullyDecoded(true).make(); + return decode(inputStream); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 51fbd3a7f..21deb4158 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -32,10 +32,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFIDHeaderLine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; +import java.io.*; import java.util.*; /** @@ -207,14 +204,27 @@ public final class BCF2Utils { * @return the BCF */ @Requires("vcfFile != null") - @Ensures("result != null") public static final File shadowBCF(final File vcfFile) { final String path = vcfFile.getAbsolutePath(); if ( path.contains(".vcf") ) return new File(path.replace(".vcf", ".bcf")); else { final File bcf = new File( path + ".bcf" ); - return bcf.canWrite() ? bcf : null; + if ( bcf.canRead() ) + return bcf; + else { + try { + // this is the only way to robustly decide if we could actually write to BCF + final FileOutputStream o = new FileOutputStream(bcf); + o.close(); + bcf.delete(); + return bcf; + } catch ( FileNotFoundException e ) { + return null; + } catch ( IOException e ) { + return null; + } + } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 3a5f2efe8..14b75fbc6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -30,7 +30,6 @@ public class VCFIntegrationTest extends WalkerTest { // See https://getsatisfaction.com/gsa/topics/support_vcf_4_1_structural_variation_breakend_alleles?utm_content=topic_link&utm_medium=email&utm_source=new_topic public void testReadingAndWritingBreakpointAlleles() { String testVCF = privateTestDir + "breakpoint-example.vcf"; - //String testVCF = validationDataLocation + 
"multiallelic.vcf"; String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; @@ -51,11 +50,20 @@ public class VCFIntegrationTest extends WalkerTest { } @Test - public void testReadingAndWritingSamtoolsWExBCFExample() { + public void testWritingSamtoolsWExBCFExample() { String testVCF = privateTestDir + "ex2.vcf"; String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("9773d6a121cfcb18d090965bc520f120")); - executeTest("Test reading and writing samtools WEx vcf/BCF example", spec1); + executeTest("Test writing samtools WEx BCF example", spec1); + } + + @Test + public void testReadingSamtoolsWExBCFExample() { + String testVCF = privateTestDir + "ex2.bcf"; + String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("63a2e0484ae37b0680514f53e0bf0c94")); + executeTest("Test reading samtools WEx BCF example", spec1); } } From 0b5980d7b383991e61118f3c873742f7c1382d2c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Jun 2012 17:25:44 -0400 Subject: [PATCH 08/32] Added Heng's nasty ex2.vcf to standard tests --- .../sting/utils/variantcontext/VariantContextTestProvider.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index c75e22041..b0c85fe28 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -63,6 +63,7 @@ public class VariantContextTestProvider { private final static List 
testSourceVCFs = Arrays.asList( new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"), + new File(BaseTest.privateTestDir + "ex2.vcf"), new File(BaseTest.privateTestDir + "dbsnp_135.b37.1000.vcf") ); From c1ac0e2760791a6e5350b4818769f27634a49e23 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Jun 2012 10:27:37 -0400 Subject: [PATCH 09/32] BCF2 cleanup -- allowMissingVCFHeaders is now part of -U argument. If you want specifically unsafe VCF processing you need -U LENIENT_VCF_PROCESSING. Updated lots of files to use this -- LENIENT_VCF_PROCESSING disables on the fly VCF header cleanup. This is now implemented via a member variable, not a class variable, which I believe was changing the GATK behavior during integration tests, causing some files to fail that pass when run as a single test because the header reading behavior was changing depending on previous failures. --- .../sting/gatk/GenomeAnalysisEngine.java | 12 +++++++++++- .../sting/gatk/arguments/GATKArgumentCollection.java | 3 --- .../sting/gatk/arguments/ValidationExclusion.java | 1 + .../gatk/io/stubs/VariantContextWriterStub.java | 2 +- .../sting/gatk/refdata/tracks/FeatureManager.java | 12 +++++++++--- .../sting/gatk/refdata/tracks/RMDTrackBuilder.java | 3 ++- .../gatk/walkers/diffengine/VCFDiffableReader.java | 7 ++++--- .../gatk/walkers/variantutils/SelectVariants.java | 2 +- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 8 ++++++-- .../sting/utils/variantcontext/VariantContext.java | 12 ++++++------ .../utils/variantcontext/writer/BCF2Writer.java | 2 +- .../sting/utils/variantcontext/writer/VCFWriter.java | 2 +- .../gatk/walkers/beagle/BeagleIntegrationTest.java | 8 ++++---- .../VariantRecalibrationWalkersIntegrationTest.java | 4 ++-- .../variantutils/CombineVariantsIntegrationTest.java | 4 ++-- .../variantutils/SelectVariantsIntegrationTest.java | 10 +++++----- .../variantcontext/VariantContextTestProvider.java | 4 ++-- 
.../writer/VariantContextWritersUnitTest.java | 2 +- 18 files changed, 59 insertions(+), 39 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 6fa70f437..80cbd3dad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -830,7 +830,8 @@ public class GenomeAnalysisEngine { throw new UserException.CouldNotReadInputFile(getArguments().repairVCFHeader, e); } } - RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser,header,validationExclusionType); + + RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, header, validationExclusionType); List dataSources = new ArrayList(); for (RMDTriplet fileDescriptor : referenceMetaDataFiles) @@ -854,6 +855,15 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(); } + public boolean lenientVCFProcessing() { + return lenientVCFProcessing(argCollection.unsafe); + } + + public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) { + return val == ValidationExclusion.TYPE.ALL + || val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING; + } + /** * Returns the unmerged SAM file header for an individual reader. * @param reader The reader. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index babbb7ab8..13c737a2e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -347,9 +347,6 @@ public class GATKArgumentCollection { public boolean USE_SLOW_GENOTYPES = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - @Argument(fullName="allowMissingVCFHeaders",shortName = "allowMissingVCFHeaders",doc="If provided, the GATK will write out VCF files that contain INFO, FILTER, and FORMAT fields not found in the VCF header",required=false) - public boolean allowMissingVCFHeaders = false; - /** * The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file * and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java index 577f7929a..52c77326a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java @@ -40,6 +40,7 @@ public class ValidationExclusion { ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities + LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. 
Don't worry about size differences between header and values, etc. @EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL ALL // do not check for all of the above conditions, DEFAULT } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java index 819ae6d27..6ed889eb6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java @@ -183,7 +183,7 @@ public class VariantContextWriterStub implements Stub, Var List options = new ArrayList(); if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES); - if ( engine.getArguments().allowMissingVCFHeaders ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); + if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY); return options.isEmpty() ? 
EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java index 3f03b30dd..b5d5deedb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -85,16 +86,18 @@ public class FeatureManager { private final PluginManager pluginManager; private final Collection featureDescriptors = new TreeSet(); private final VCFHeader headerForRepairs; + private final boolean lenientVCFProcessing; /** * Construct a FeatureManager without a master VCF header */ public FeatureManager() { - this(null); + this(null, false); } - public FeatureManager(final VCFHeader headerForRepairs) { + public FeatureManager(final VCFHeader headerForRepairs, final boolean lenientVCFProcessing) { this.headerForRepairs = headerForRepairs; + this.lenientVCFProcessing = lenientVCFProcessing; pluginManager = new PluginManager(FeatureCodec.class, "Codecs", "Codec"); for (final String rawName: pluginManager.getPluginsByName().keySet()) { @@ -252,8 +255,11 @@ public class FeatureManager { ((NameAwareCodec)codex).setName(name); if ( codex instanceof ReferenceDependentFeatureCodec ) ((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser); - if ( codex instanceof 
VCFCodec) + if ( codex instanceof VCFCodec ) ((VCFCodec)codex).setHeaderForRepairs(headerForRepairs); + if ( codex instanceof AbstractVCFCodec && lenientVCFProcessing ) + ((AbstractVCFCodec)codex).disableOnTheFlyModifications(); + return codex; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 25e005601..e183fe169 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -34,6 +34,7 @@ import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.util.LittleEndianOutputStream; import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; @@ -98,7 +99,7 @@ public class RMDTrackBuilder { // extends PluginManager { this.dict = dict; this.validationExclusionType = validationExclusionType; this.genomeLocParser = genomeLocParser; - this.featureManager = new FeatureManager(headerForRepairs); + this.featureManager = new FeatureManager(headerForRepairs, GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index df5f5adf1..b4cf96831 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -64,9 +64,10 @@ public class VCFDiffableReader implements DiffableReader { 
root.add("VERSION", version); br.close(); - // must be read as state is stored in reader itself - AbstractVCFCodec.disableOnTheFlyModifications(); - FeatureReader reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); + final VCFCodec vcfCodec = new VCFCodec(); + vcfCodec.disableOnTheFlyModifications(); // must be read as state is stored in reader itself + + FeatureReader reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), vcfCodec, false); VCFHeader header = (VCFHeader)reader.getHeader(); for ( VCFHeaderLine headerLine : header.getMetaData() ) { String key = headerLine.getKey(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 33ab5a4c3..fbffd620a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -510,7 +510,7 @@ public class SelectVariants extends RodWalker implements TreeR for (VariantContext vc : vcs) { // an option for performance testing only if ( fullyDecode ) - vc = vc.fullyDecode(vcfRods.get(vc.getSource())); + vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() ); // an option for performance testing only if ( forceGenotypesDecode ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 77aed0e0b..f9f310538 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -22,7 +22,6 @@ import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { public final static 
int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); - protected static boolean doOnTheFlyModifications = true; protected final static Logger log = Logger.getLogger(AbstractVCFCodec.class); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column @@ -61,6 +60,11 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec protected boolean warnedAboutNoEqualsForNonFlag = false; + /** + * If true, then we'll magically fix up VCF headers on the fly when we read them in + */ + protected boolean doOnTheFlyModifications = true; + protected AbstractVCFCodec() { super(VariantContext.class); } @@ -850,7 +854,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec * of VCF records. Useful primarily for raw comparisons such as when comparing * raw VCF records */ - public static final void disableOnTheFlyModifications() { + public final void disableOnTheFlyModifications() { doOnTheFlyModifications = false; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 8908782f1..cb02a91bb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1337,13 +1337,13 @@ public class VariantContext implements Feature { // to enable tribble integratio * @param header containing types about all fields in this VC * @return a fully decoded version of this VC */ - public VariantContext fullyDecode(final VCFHeader header) { + public VariantContext fullyDecode(final VCFHeader header, final boolean lenientDecoding) { if ( isFullyDecoded() ) return this; else { // TODO -- warning this is potentially very expensive as it creates copies over and over final VariantContextBuilder builder = new VariantContextBuilder(this); - fullyDecodeInfo(builder, header); + fullyDecodeInfo(builder, header, 
lenientDecoding); fullyDecodeGenotypes(builder, header); builder.fullyDecoded(true); return builder.make(); @@ -1358,13 +1358,13 @@ public class VariantContext implements Feature { // to enable tribble integratio return fullyDecoded; } - private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header) { - builder.attributes(fullyDecodeAttributes(getAttributes(), header, false)); + private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header, final boolean lenientDecoding) { + builder.attributes(fullyDecodeAttributes(getAttributes(), header, lenientDecoding)); } private final Map fullyDecodeAttributes(final Map attributes, final VCFHeader header, - final boolean allowMissingValuesComparedToHeader) { + final boolean lenientDecoding) { final Map newAttributes = new HashMap(attributes.size()); for ( final Map.Entry attr : attributes.entrySet() ) { @@ -1377,7 +1377,7 @@ public class VariantContext implements Feature { // to enable tribble integratio final Object decoded = decodeValue(field, attr.getValue(), format); if ( decoded != null && - ! allowMissingValuesComparedToHeader + ! lenientDecoding && format.getCountType() != VCFHeaderLineCount.UNBOUNDED && format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements final int obsSize = decoded instanceof List ? 
((List) decoded).size() : 1; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index eada05578..1e15c2bc5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -155,7 +155,7 @@ class BCF2Writer extends IndexingVariantContextWriter { public void add( VariantContext vc ) { if ( doNotWriteGenotypes ) vc = new VariantContextBuilder(vc).noGenotypes().make(); - vc = vc.fullyDecode(header); + vc = vc.fullyDecode(header, false); super.add(vc); // allow on the fly indexing diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index fcd3eb071..03a62019c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -569,6 +569,6 @@ class VCFWriter extends IndexingVariantContextWriter { + " at " + vc.getChr() + ":" + vc.getStart() + " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have" + " complete VCF headers by default. 
This error can be disabled with the engine argument" - + " --allowMissingVCFHeaders"); + + " -U LENIENT_VCF_PROCESSING"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 234680ad7..0458f2ad7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s --no_cmdline_in_header --allowMissingVCFHeaders", 1, Arrays.asList("cba514105039f7a56f7ecdd241fbdcca")); + "-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("cba514105039f7a56f7ecdd241fbdcca")); spec.disableShadowBCF(); executeTest("test BeagleOutputToVCF", spec); } @@ -51,7 +51,7 @@ public class BeagleIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ProduceBeagleInput -R " + hg19Reference + " " + "--variant:VCF3 " + beagleValidationDataLocation + "inttestbgl.input.vcf " + - "-o %s --allowMissingVCFHeaders", 1, Arrays.asList("f301b089d21da259873f04bdc468835d")); + "-o %s -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("f301b089d21da259873f04bdc468835d")); spec.disableShadowBCF(); executeTest("test BeagleInput", spec); } @@ -61,7 +61,7 @@ public class BeagleIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ProduceBeagleInput --variant:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+ "--validation:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+ - "-L 
22:14000000-16000000 -o %s -bvcf %s -bs 0.8 --allowMissingVCFHeaders -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta --no_cmdline_in_header ",2, + "-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -U LENIENT_VCF_PROCESSING -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta --no_cmdline_in_header ",2, Arrays.asList("660986891b30cdc937e0f2a3a5743faa","4b6417f892ccfe5c63b8a60cb0ef3740")); spec.disableShadowBCF(); executeTest("test BeagleInputWithBootstrap",spec); @@ -75,7 +75,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s --no_cmdline_in_header --allowMissingVCFHeaders",1,Arrays.asList("fbbbebfda35bab3f6f62eea2f0be1c01")); + "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("d95a97068a97c9059811b2574b73ea60")); spec.disableShadowBCF(); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 56bf8a567..0e213a090 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -66,7 +66,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -L 20:12,000,000-30,000,000" + " --no_cmdline_in_header" + " -input " + params.inVCF + - " -o %s" + + " -U LENIENT_VCF_PROCESSING -o %s" + " -tranchesFile " + 
getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); @@ -113,7 +113,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -T ApplyRecalibration" + " -L 20:12,000,000-30,000,000" + " -mode INDEL" + - " --no_cmdline_in_header" + + " -U LENIENT_VCF_PROCESSING --no_cmdline_in_header" + " -input " + params.inVCF + " -o %s" + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 981f00071..2bd91ca85 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -38,14 +38,14 @@ public class CombineVariantsIntegrationTest extends WalkerTest { // TODO TODO TODO TODO TODO TODO TODO TODO // TODO TODO TODO TODO TODO TODO TODO TODO // - // TODO WHEN THE HC EMITS VALID VCF HEADERS ENABLE BCF AND REMOVE allowMissingVCFHeaders ARGUMENTS + // TODO WHEN THE HC EMITS VALID VCF HEADERS ENABLE BCF AND REMOVE lenientVCFProcessing ARGUMENTS // // TODO TODO TODO TODO TODO TODO TODO TODO // TODO TODO TODO TODO TODO TODO TODO TODO // TODO TODO TODO TODO TODO TODO TODO TODO // private static String baseTestString(String args) { - return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s --allowMissingVCFHeaders -R " + b36KGReference + args; + return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -U LENIENT_VCF_PROCESSING -R " + b36KGReference + args; } private void cvExecuteTest(final String name, final WalkerTestSpec spec) { diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index a23a22162..7af9a7aa3 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -18,7 +18,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile - + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders --allowMissingVCFHeaders", + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("d88bdae45ae0e74e8d8fd196627e612c") ); @@ -47,7 +47,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile - + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders", + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("54289033d35d32b8ebbb38c51fbb614c") ); @@ -93,7 +93,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile - + " -o %s --no_cmdline_in_header --allowMissingVCFHeaders", + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("946e7f2e0ae08dc0e931c1634360fc46") ); @@ -161,7 +161,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s 
--no_cmdline_in_header", 1, - Arrays.asList("a0b7f77edc16df0992d2c1363136a17e") + Arrays.asList("ef3c5f75074a5dd2b2cd2715856a2542") ); executeTest("testNoGTs--" + testFile, spec); @@ -223,7 +223,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { final String testFile = privateTestDir + "missingHeaderLine.vcf"; final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header" - + (expectedException == null ? " -allowMissingVCFHeaders" : ""); + + (expectedException == null ? " -lenientVCFProcessing" : ""); WalkerTestSpec spec = expectedException != null ? new WalkerTestSpec(cmd, 1, expectedException) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index b0c85fe28..14d60b6e9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -149,7 +149,7 @@ public class VariantContextTestProvider { logger.warn("Reading records from " + file); for ( final VariantContext raw : x.getSecond() ) { if ( raw != null ) - fullyDecoded.add(raw.fullyDecode(x.getFirst())); + fullyDecoded.add(raw.fullyDecode(x.getFirst(), false)); } logger.warn("Done reading " + file); @@ -599,7 +599,7 @@ public class VariantContextTestProvider { public VariantContext next() { try { final VariantContext vc = codec.decode(pbs); - return vc == null ? null : vc.fullyDecode(header); + return vc == null ? 
null : vc.fullyDecode(header, false); } catch ( IOException e ) { throw new RuntimeException(e); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java index 9ecffe939..1b791bf6c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java @@ -121,7 +121,7 @@ public class VariantContextWritersUnitTest extends BaseTest { final List fullyDecoded = new ArrayList(vcsAfterIO.size()); for ( final VariantContext withStrings : vcsAfterIO ) - fullyDecoded.add(withStrings.fullyDecode(header)); + fullyDecoded.add(withStrings.fullyDecode(header, false)); return fullyDecoded; } From 39c849aced5bfe060435b18044b2f377b20a785b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Jun 2012 13:14:55 -0400 Subject: [PATCH 10/32] Bugfix to ensure the DB=1 old files decode properly --- .../sting/utils/variantcontext/VariantContext.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index cb02a91bb..b4adc0a8a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1431,7 +1431,7 @@ public class VariantContext implements Feature { // to enable tribble integratio switch ( format.getType() ) { case Character: return string; case Flag: - final boolean b = Boolean.valueOf(string); + final boolean b = Boolean.valueOf(string) || string.equals("1"); if ( b == false ) throw new UserException.MalformedVCF("VariantContext FLAG fields " + field + " 
cannot contain false values" + " as seen at " + getChr() + ":" + getStart()); From 5f5885ec78c5df2a4566fac4cef8fa62121f3222 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Jun 2012 13:16:18 -0400 Subject: [PATCH 11/32] Updating many MD5s to reflect correct fixed headers -- Previous bugfix ensures that header fixing is always on in the GATK by default, even after integration tests that failed and when through the VCFDiffableReader. Updating md5s to reflect this. --- .../VariantAnnotatorIntegrationTest.java | 42 +++++++++---------- .../walkers/beagle/BeagleIntegrationTest.java | 4 +- .../VariantFiltrationIntegrationTest.java | 22 +++++----- .../PhaseByTransmissionIntegrationTest.java | 14 +++---- .../ReadBackedPhasingIntegrationTest.java | 12 +++--- ...ntRecalibrationWalkersIntegrationTest.java | 3 +- .../CombineVariantsIntegrationTest.java | 10 ++--- .../LiftoverVariantsIntegrationTest.java | 6 +-- .../SelectVariantsIntegrationTest.java | 16 +++---- .../VariantsToVCFIntegrationTest.java | 2 +- 10 files changed, 66 insertions(+), 65 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index b83ef67c4..0b45dc931 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -16,7 +16,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("55785745fe13ad81a2c4a14373d091f0")); + Arrays.asList("360610e4990860bb5c45249b8ac31e5b")); executeTest("test 
file has annotations, not asking for annotations, #1", spec); } @@ -24,7 +24,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsNotAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("d6f749f8dbeb2d42c9effaff9fe571d7")); + Arrays.asList("d69a3c92a0e8f44e09e7377e3eaed4e8")); executeTest("test file has annotations, not asking for annotations, #2", spec); } @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("9084e6c7b1cec0f3a2c6d96711844d5e")); + Arrays.asList("e0a08416249515ea18bd0663c90c9330")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("3dfabdcaa2648ac34380fb71860c42d3")); + Arrays.asList("0b60da46ba0eabb3abe5e0288937f9b0")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -48,7 +48,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - 
Arrays.asList("b85c1ea28194484b327fbe0add1b5685")); + Arrays.asList("540a9be8a8cb85b0f675fea1184bf78c")); executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); } @@ -58,7 +58,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { // they don't get reordered. It's a good test of the genotype ordering system. WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("fe4d4e2484c4cf8b1cd50ad42cfe468e")); + Arrays.asList("f900e65b65ff0f9d9eb0891ef9b28c73")); executeTest("test file doesn't have annotations, not asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("043fc6205b0633edcd3fadc9e044800c")); + Arrays.asList("5eb576d0234c912d8efea184492691d0")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("6fafb42d374a67ba4687a23078a126af")); + Arrays.asList("8860524d793d24b2e32f318433fcf527")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { 
WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("639462a0e0fa79e33def5f011fe55961")); + Arrays.asList("f33f417fad98c05d9cd08ffa22943b0f")); executeTest("test exclude annotations", spec); } @@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("afe6c9d3b4b80635a541cdfcfa48db2f")); + Arrays.asList("1c423b7730b9805e7b885ece924286e0")); executeTest("not passing it any reads", spec); } @@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("21d696ea8c55d2fd4cbb4dcd5f7f7db6")); + Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); executeTest("getting DB tag with dbSNP", spec); } @@ -114,7 +114,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testMultipleIdsWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, - Arrays.asList("ef95394c14d5c16682a322f3dfb9000c")); + Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); executeTest("adding multiple IDs with dbSNP", spec); } @@ -122,7 +122,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void 
testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("e6e276b7d517d57626c8409589cd286f")); + Arrays.asList("cc7184263975595a6e2473d153227146")); executeTest("getting DB tag with HM3", spec); } @@ -130,7 +130,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoQuals() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, - Arrays.asList("a99e8315571ed1b6bce942451b3d8612")); + Arrays.asList("aea983adc01cd059193538cc30adc17d")); executeTest("test file doesn't have QUALs", spec); } @@ -138,7 +138,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("7d6ea3b54210620cbc7e14dad8836bcb")); + Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); executeTest("using expression", spec); } @@ -146,13 +146,13 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("35ce4fb0288dfc5c01ec6ce8b14c6157")); + Arrays.asList("3de1d1998203518098ffae233f3e2352")); executeTest("using expression with ID", spec); } @Test public void 
testTabixAnnotations() { - final String MD5 = "5aebcf8f76c649d645708b1262185c80"; + final String MD5 = "99938d1e197b8f10c408cac490a00a62"; for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -A HomopolymerRun --variant:vcf " + validationDataLocation + file + " -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, @@ -168,7 +168,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + "snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", 1, - Arrays.asList("0c20cda1cf0b903a287f1807ae5bee02") + Arrays.asList("d9291845ce5a8576898d293a829a05b7") ); executeTest("Testing SnpEff annotations", spec); } @@ -187,7 +187,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testTDTAnnotation() { - final String MD5 = "81f85f0ce8cc36df7c717c478e100ba1"; + final String MD5 = "427dfdc665359b67eff210f909ebf8a2"; WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, @@ -198,7 +198,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testChromosomeCountsPed() { - final String MD5 = "9830fe2247651377e68ad0b0894e9a4e"; + final String MD5 = "6b5cbedf4a8b3385edf128d81c8a46f2"; WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + b37KGReference + " -A ChromosomeCounts --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf 
--no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, @@ -208,7 +208,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testInbreedingCoeffPed() { - final String MD5 = "e94d589b5691e3ecfd9cc9475a384890"; + final String MD5 = "159a771c1deaeffb786097e106943893"; WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + b37KGReference + " -A InbreedingCoeff --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 0458f2ad7..8fe96b53d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("cba514105039f7a56f7ecdd241fbdcca")); + "-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("c5522304abf0633041c7772dd7dafcea")); spec.disableShadowBCF(); executeTest("test BeagleOutputToVCF", spec); } @@ -75,7 +75,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle 
/humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("d95a97068a97c9059811b2574b73ea60")); + "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("fbbbebfda35bab3f6f62eea2f0be1c01")); spec.disableShadowBCF(); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 573f25b70..ae5128c75 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -16,7 +16,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testNoAction() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fbf88e25df30181ca5422a374c7b36fa")); + Arrays.asList("a890cd298298e22bc04a2e5a20b71170")); executeTest("test no action", spec); } @@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testClusteredSnps() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -window 10 --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("bb69f49e9ef0054f0ccd6d38f5ffa46a")); + Arrays.asList("f46b2fe2dbe6a423b5cfb10d74a4966d")); executeTest("test clustered SNPs", spec); } @@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testMask1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -maskName foo --mask " + privateTestDir + "vcfexample2.vcf --variant " + privateTestDir + "vcfexample2.vcf -L 
1:10,020,000-10,021,000", 1, - Arrays.asList("7e3225a32fcd6066901247992b2c5ca8")); + Arrays.asList("86dbbf62a0623b2dc5e8969c26d8cb28")); executeTest("test mask all", spec1); } @@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testMask2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -maskName foo --mask:VCF " + privateTestDir + "vcfMask.vcf --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("3485fe95e3f0864c3575baf05cef4bcc")); + Arrays.asList("2fb33fccda1eafeea7a2f8f9219baa39")); executeTest("test mask some", spec2); } @@ -48,7 +48,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testMask3() { WalkerTestSpec spec3 = new WalkerTestSpec( baseTestString() + " -maskName foo -maskExtend 10 --mask:VCF " + privateTestDir + "vcfMask.vcf --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("367ab9c028a68e4eda2055e3bb8b486c")); + Arrays.asList("4351e00bd9d821e37cded5a86100c973")); executeTest("test mask extend", spec3); } @@ -56,7 +56,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilter1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("5a10d969e50a58d8dfbf1da54bf293df")); + Arrays.asList("2f056b50a41c8e6ba7645ff4c777966d")); executeTest("test filter #1", spec); } @@ -64,7 +64,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilter2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("886dbbca2350083819ff67224f6efbd6")); + 
Arrays.asList("b2a8c1a5d99505be79c03120e9d75f2f")); executeTest("test filter #2", spec); } @@ -72,7 +72,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testFilterWithSeparateNames() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("ee78c2e7128a8f9549233493c7cf6949")); + Arrays.asList("e350d9789bbdf334c1677506590d0798")); executeTest("test filter with separate names #2", spec); } @@ -80,7 +80,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("285dd348c47c8c1e85d2886f9b33559e")); + Arrays.asList("060e9e7b6faf8b2f7b3291594eb6b39c")); executeTest("test genotype filter #1", spec1); } @@ -88,7 +88,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'isHomVar == 1' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("a9c835a13eb72aa22d5e271894d8ac33")); + Arrays.asList("00f90028a8c0d56772c47f039816b585")); executeTest("test genotype filter #2", spec2); } @@ -96,7 +96,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testDeletions() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --filterExpression 'QUAL < 100' --filterName foo --variant:VCF " + privateTestDir + "twoDeletions.vcf", 1, - Arrays.asList("a1c02a5a90f1262e9eb3d2cad1fd08f2")); + Arrays.asList("8077eb3bab5ff98f12085eb04176fdc9")); executeTest("test deletions", spec); } } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index b3c85622e..19d1e4cb3 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -29,7 +29,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("cd112ec37a9e28d366aff29a85fdcaa0","313cc749c7ee97713e4551de39e01ac5") + Arrays.asList("cd112ec37a9e28d366aff29a85fdcaa0","f8721f4f5d3bae2848ae15c3f120709b") ); executeTest("testTrueNegativeMV", spec); } @@ -48,7 +48,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("27ccd6feb51de7e7dcdf35f4697fa4eb","dd90dad9fd11e1b16e6660c3ca0553e7") + Arrays.asList("27ccd6feb51de7e7dcdf35f4697fa4eb","547fdfef393f3045a96d245ef6af8acb") ); executeTest("testTruePositiveMV", spec); } @@ -67,7 +67,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("719d681bb0a52a40bc854bba107c5c94","b35a86d2cad17f0db7b5e84ddc0e5545") + Arrays.asList("719d681bb0a52a40bc854bba107c5c94","9529e2bf214d72e792d93fbea22a3b91") ); executeTest("testFalsePositiveMV", spec); } @@ -86,7 +86,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("7f4a277aee2c7398fcfa84d6c98d5fb3","c53b5fd377bef48e9c6035a94db398db") + Arrays.asList("7f4a277aee2c7398fcfa84d6c98d5fb3","8c157d79dd00063d2932f0d2b96f53d8") ); executeTest("testSpecialCases", spec); } @@ -108,7 +108,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 2, - Arrays.asList("44e09d2f9e4d8a9488226d03a97fe999","6f596470740e1a57679bbb38c0126364") + 
Arrays.asList("44e09d2f9e4d8a9488226d03a97fe999","343e418850ae4a687ebef2acd55fcb07") ); executeTest("testPriorOption", spec); } @@ -128,7 +128,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("b35a86d2cad17f0db7b5e84ddc0e5545") + Arrays.asList("9529e2bf214d72e792d93fbea22a3b91") ); executeTest("testMVFileOption", spec); } @@ -149,7 +149,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-fatherAlleleFirst" ), 2, - Arrays.asList("60ced3d078792a150a03640b62926857","6d550784382aa910f78b533d889c91c0") + Arrays.asList("60ced3d078792a150a03640b62926857","52ffa82428e63ade22ea37b72ae58492") ); executeTest("testFatherAlleleFirst", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java index bb4b7a1be..11f1a0628 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -26,7 +26,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:332341-382503", 1, - Arrays.asList("442c819569417c1b7d6be9f41ce05394")); + Arrays.asList("1c9a7fe4db41864cd85d16e5cf88986c")); executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); } @@ -36,7 +36,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:1232503-1332503", 1, - Arrays.asList("2a51ee7d3c024f2410dcee40c5412993")); + Arrays.asList("a3ca151145379e0d4bae64a91165ea0b")); executeTest("MAX 10 het sites [TEST 
TWO]; require PQ >= 10", spec); } @@ -46,7 +46,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) + " -L chr20:332341-382503", 1, - Arrays.asList("85bc9b03e24159f746dbd0cb988f9ec8")); + Arrays.asList("f685803333123a102ce1851d984cbd10")); executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); } @@ -56,7 +56,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) + " -L chr20:332341-382503", 1, - Arrays.asList("96bb413a83c777ebbe622438e4565e8f")); + Arrays.asList("aaa7c25d118383639f273128d241e140")); executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); } @@ -66,7 +66,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) + " -L chr20:332341-482503", 1, - Arrays.asList("7d2402f055d243e2208db9ea47973e13")); + Arrays.asList("418e29400762972e77bae4f73e16befe")); executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); } @@ -76,7 +76,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:652810-681757", 1, - Arrays.asList("72682b3f27c33580d2d4515653ba6de7")); + Arrays.asList("4c8f6190ecc86766baba3aba08542991")); executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 0e213a090..8d6a18de0 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -27,7 +27,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf", "0ddd1e0e483d2eaf56004615cea23ec7", // tranches "b9709e4180e56abc691b208bd3e8626c", // recal file - "c58ff4140e8914f0b656ed625c7f73b9"); // cut VCF + "4c73ff0c8c5ae0055bfacf33329a2406"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -119,6 +119,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 2bd91ca85..bbee99ba6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -142,17 +142,17 @@ public class CombineVariantsIntegrationTest extends WalkerTest { cvExecuteTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); } - @Test public void 
complexTestFull() { combineComplexSites("", "8b19b54516b59de40992f0c4b328258a"); } - @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "a38dd097adc37420fe36ef8be14cfded"); } - @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "a3957dac9a617f50ce2668607e3baef0"); } - @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "a3957dac9a617f50ce2668607e3baef0"); } + @Test public void complexTestFull() { combineComplexSites("", "151a4970367dd3e73ba3e7f3c2f874f6"); } + @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "c0625e092b878b3d3eb1703c48e216b7"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "6978329d6a1033ac16f83b49072c679b"); } + @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "6978329d6a1033ac16f83b49072c679b"); } @Test public void combineDBSNPDuplicateSites() { WalkerTestSpec spec = new WalkerTestSpec( "-T CombineVariants --no_cmdline_in_header -L 1:902000-903000 -o %s -R " + b37KGReference + " -V:v1 " + b37dbSNP132, 1, - Arrays.asList("3d2a5a43db86e3f6217ed2a63251285b")); + Arrays.asList("aa926eae333208dc1f41fe69dc95d7a6")); cvExecuteTest("combineDBSNPDuplicateSites:", spec); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java index 1711e6e3c..e14580ead 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -40,7 +40,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + b36KGReference 
+ " --variant " + privateTestDir + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("a139480c004859452d4095fe4859b42e")); + Arrays.asList("7d5f91fcf419211ae9eca6c66dcec0e6")); executeTest("test b36 to hg19", spec); } @@ -49,7 +49,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + b36KGReference + " --variant " + privateTestDir + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.unsortedSamples.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("91344768f1e98c979364ec0d5d3aa9d6")); + Arrays.asList("29dab3555e7f1ee6a60e267b00215a11")); executeTest("test b36 to hg19, unsorted samples", spec); } @@ -58,7 +58,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + hg18Reference + " --variant:vcf " + privateTestDir + "liftover_test.vcf -chain " + validationDataLocation + "hg18ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("e0b813ff873185ab51995a151f80ec98")); + Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d")); executeTest("test hg18 to hg19, unsorted", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 7af9a7aa3..30cdbee36 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -34,7 +34,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -sn B -sn C --variant " + testfile), 1, - Arrays.asList("337bb7fc23153cf67acc42a466834775") + Arrays.asList("3d98a024bf3aecbd282843e0af89d0e6") ); executeTest("testRepeatedLineSelection--" + testfile, spec); @@ -64,7 +64,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), 1, - Arrays.asList("ad0514b723ee1479d861291622bd4311") + Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") ); spec.disableShadowBCF(); executeTest("testComplexSelection--" + testfile, spec); @@ -78,7 +78,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, 1, - Arrays.asList("bc0e00d0629b2bc6799e7e9db0dc775c") + Arrays.asList("1f5c72951a35667c4bdf1be153787e27") ); spec.disableShadowBCF(); @@ -109,7 +109,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("a111642779b05de33ad04073d6022c21") + Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794") ); executeTest("testVariantTypeSelection--" + testFile, spec); @@ -176,7 +176,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"), 1, - 
Arrays.asList("ad0514b723ee1479d861291622bd4311") + Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") ); spec.disableShadowBCF(); executeTest("testParallelization (2 threads)--" + testfile, spec); @@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"), 1, - Arrays.asList("ad0514b723ee1479d861291622bd4311") + Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") ); spec.disableShadowBCF(); @@ -204,7 +204,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, 1, - Arrays.asList("9acd6effcc78bfb832bed5edfd6a1b5b") + Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823") ); executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); } @@ -223,7 +223,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { final String testFile = privateTestDir + "missingHeaderLine.vcf"; final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header" - + (expectedException == null ? " -lenientVCFProcessing" : ""); + + (expectedException == null ? " -U LENIENT_VCF_PROCESSING" : ""); WalkerTestSpec spec = expectedException != null ? 
new WalkerTestSpec(cmd, 1, expectedException) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java index eb79228e7..b0870b346 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java @@ -89,7 +89,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingVCFInput() { List md5 = new ArrayList(); - md5.add("95898aad8c9f9515c0e668e2fb65a024"); + md5.add("21084d32ce7ac5df3cee1730bfaaf71c"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + From 7ef5ce28cc35eb93c2267643244d2afa9bc2b238 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Jun 2012 20:56:00 -0400 Subject: [PATCH 12/32] VariantRecalibrator test currently doesn't work with shadowBCF --- .../VariantRecalibrationWalkersIntegrationTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 8d6a18de0..857032579 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -70,6 +70,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); + 
spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testApplyRecalibration-"+params.inVCF, spec); } From 1f45551a15311a71b90f3bb0f24230f3a0e85803 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 26 Jun 2012 15:28:17 -0400 Subject: [PATCH 13/32] Bugfixes to G count types in VCF header -- Previously VCF header lines of count type G assumed that the sample would be diploid. -- Generalized the code to take a VariantContext and return the right result for G count types by calling into the correct numGenotypes in GenotypeLikelihoods class -- renamed calcNumGenotypes to numGenotypes, which uses a static cache in the class -- calcNumGenotypes is private, and is used to build the static cache or to compute on the fly for uncached No. allele / ploidy combinations -- VariantContext calls into getMaxPloidy in GenotypesContext, which caches the max ploidy among samples -- Added extensive unit tests that compare A and G type values in genotypes --- .../codecs/vcf/VCFCompoundHeaderLine.java | 21 ++- .../sting/utils/variantcontext/Genotype.java | 2 +- .../variantcontext/GenotypeLikelihoods.java | 144 +++++++++--------- .../variantcontext/GenotypesContext.java | 15 ++ .../utils/variantcontext/VariantContext.java | 11 +- .../variantcontext/VariantContextBuilder.java | 12 +- .../variantcontext/VariantContextUtils.java | 4 +- .../writer/BCF2FieldEncoder.java | 2 +- .../variantcontext/writer/VCFWriter.java | 2 +- .../GenotypeLikelihoodsUnitTest.java | 4 +- .../VariantContextTestProvider.java | 52 ++++++- 11 files changed, 176 insertions(+), 93 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 97f3ecd0c..6f9a8f5e6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -28,6 +28,8 @@ import org.apache.log4j.Logger; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; import java.util.LinkedHashMap; @@ -67,14 +69,25 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF return count; } - // utility method - public int getCount(int numAltAlleles) { + /** + * Get the number of values expected for this header field, given the properties of VariantContext vc + * + * If the count is a fixed count, return that. For example, a field with size of 1 in the header returns 1 + * If the count is of type A, return vc.getNAlleles - 1 + * If the count is of type G, return the expected number of genotypes given the number of alleles in VC and the + * max ploidy among all samples + * If the count is UNBOUNDED return -1 + * + * @param vc + * @return + */ + public int getCount(final VariantContext vc) { int myCount; switch ( countType ) { case INTEGER: myCount = count; break; case UNBOUNDED: myCount = -1; break; - case A: myCount = numAltAlleles; break; - case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break; + case A: myCount = vc.getNAlleles() - 1; break; + case G: myCount = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), vc.getMaxPloidy()); break; default: throw new ReviewedStingException("Unknown count type: " + countType); } return myCount; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index f45b0e615..d268aabc6 100755 --- 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -554,7 +554,7 @@ public abstract class Genotype implements Comparable { pairs.add(k + "=" + c.get(k)); } - return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; + return pairs.isEmpty() ? "" : " {" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index fa41a3c99..7c745628a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.variantcontext; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -34,6 +36,11 @@ import java.util.Arrays; import java.util.EnumMap; public class GenotypeLikelihoods { + private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; + private final static int NUM_LIKELIHOODS_CACHE_PLOIDY = 10; + // caching numAlleles up to 5 and ploidy up to 10 + private final static int[][] numLikelihoodCache = new int[NUM_LIKELIHOODS_CACHE_N_ALLELES][NUM_LIKELIHOODS_CACHE_PLOIDY]; + public final static int MAX_PL = Short.MAX_VALUE; // @@ -44,6 +51,30 @@ public class GenotypeLikelihoods { private double[] log10Likelihoods = null; private String likelihoodsAsString_PLs = null; + + /** + * initialize num likelihoods cache + */ + static { + // must be done before PLIndexToAlleleIndex + for ( int numAlleles = 1; numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES; numAlleles++ ) { + 
//numLikelihoodCache[numAlleles] = new int[NUM_LIKELIHOODS_CACHE_PLOIDY]; + for ( int ploidy = 1; ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY; ploidy++ ) { + numLikelihoodCache[numAlleles][ploidy] = calcNumLikelihoods(numAlleles, ploidy); + } + } + } + + /** + * The maximum number of alleles that we can represent as genotype likelihoods + */ + public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; + + /* + * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + */ + private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); + public final static GenotypeLikelihoods fromPLField(String PLs) { return new GenotypeLikelihoods(PLs); } @@ -245,47 +276,11 @@ public class GenotypeLikelihoods { return likelihoodsAsVector; } -// // ------------------------------------------------------------------------------------- -// // -// // List interface functions -// // -// // ------------------------------------------------------------------------------------- -// -// private final void notImplemented() { -// throw new ReviewedStingException("BUG: code not implemented"); -// } -// -// @Override public int size() { return getAsVector().length; } -// @Override public Double get(final int i) { return getAsVector()[i];} -// @Override public Double set(final int i, final Double aDouble) { return getAsVector()[i] = aDouble; } -// @Override public boolean isEmpty() { return false; } -// @Override public Iterator iterator() { return Arrays.asList(ArrayUtils.toObject(getAsVector())).iterator(); } -// @Override public Object[] toArray() { return ArrayUtils.toObject(getAsVector()); } -// -// // none of these are implemented -// @Override public boolean contains(final Object o) { notImplemented(); return false; } -// @Override public T[] toArray(final T[] ts) { notImplemented(); return null; } -// @Override public boolean add(final Double aDouble) { notImplemented(); 
return false; } -// @Override public boolean remove(final Object o) {notImplemented(); return false; } -// @Override public boolean containsAll(final Collection objects) { notImplemented(); return false; } -// @Override public boolean addAll(final Collection doubles) { notImplemented(); return false; } -// @Override public boolean addAll(final int i, final Collection doubles) { notImplemented(); return false; } -// @Override public boolean removeAll(final Collection objects) { notImplemented(); return false; } -// @Override public boolean retainAll(final Collection objects) { notImplemented(); return false; } -// @Override public void clear() { notImplemented(); } -// @Override public void add(final int i, final Double aDouble) { notImplemented(); } -// @Override public Double remove(final int i) { notImplemented(); return null; } -// @Override public int indexOf(final Object o) { notImplemented(); return -1; } -// @Override public int lastIndexOf(final Object o) { notImplemented(); return 0; } -// @Override public ListIterator listIterator() { notImplemented(); return null; } -// @Override public ListIterator listIterator(final int i) { notImplemented(); return null; } -// @Override public List subList(final int i, final int i1) { notImplemented(); return null; } - -// ------------------------------------------------------------------------------------- -// -// Static conversion utilities, going from GL/PL index to allele index and vice versa. -// -// ------------------------------------------------------------------------------------- + // ------------------------------------------------------------------------------------- + // + // Static conversion utilities, going from GL/PL index to allele index and vice versa. + // + // ------------------------------------------------------------------------------------- /* * Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index. 
@@ -300,18 +295,8 @@ public class GenotypeLikelihoods { } } - /** - * The maximum number of alleles that we can represent as genotype likelihoods - */ - public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; - - /* - * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles - */ - private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); - private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) { - final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2); + final int numLikelihoods = numLikelihoods(1 + altAlleles, 2); final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods]; // for all possible combinations of 2 alleles @@ -330,6 +315,32 @@ public class GenotypeLikelihoods { return cache; } + // ------------------------------------------------------------------------------------- + // + // num likelihoods given number of alleles and ploidy + // + // ------------------------------------------------------------------------------------- + + /** + * Actually does the computation in @see #numLikelihoods + * + * @param numAlleles + * @param ploidy + * @return + */ + private static final int calcNumLikelihoods(final int numAlleles, final int ploidy) { + if (numAlleles == 1) + return 1; + else if (ploidy == 1) + return numAlleles; + else { + int acc =0; + for (int k=0; k <= ploidy; k++ ) + acc += calcNumLikelihoods(numAlleles - 1, ploidy - k); + return acc; + } + } + /** * Compute how many likelihood elements are associated with the given number of alleles * Equivalent to asking in how many ways N non-negative integers can add up to P is S(N,P) @@ -344,6 +355,8 @@ public class GenotypeLikelihoods { * which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,1,0), (0,1,1), (0,0,2) * In general, for P=2 (regular biallelic), then S(N,2) = N*(N+1)/2 * + * Note 
this method caches the value for most common num Allele / ploidy combinations for efficiency + * * Recursive implementation: * S(N,P) = sum_{k=0}^P S(N-1,P-k) * because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-K @@ -355,23 +368,16 @@ public class GenotypeLikelihoods { * @param ploidy Ploidy, or number of chromosomes in set * @return Number of likelihood elements we need to hold. */ - public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { - - // fast, closed form solution for diploid samples (most common use case) - if (ploidy==2) - return numAlleles*(numAlleles+1)/2; - - if (numAlleles == 1) - return 1; - else if (ploidy == 1) - return numAlleles; - - int acc =0; - for (int k=0; k <= ploidy; k++ ) - acc += calculateNumLikelihoods(numAlleles-1, ploidy-k); - - return acc; - + @Requires({"ploidy > 0", "numAlleles > 0"}) + @Ensures("result > 0") + public static int numLikelihoods(final int numAlleles, final int ploidy) { + if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES + && ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY ) + return numLikelihoodCache[numAlleles][ploidy]; + else { + // have to calculate on the fly + return calcNumLikelihoods(numAlleles, ploidy); + } } // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index fc4175735..9577a3e63 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -61,6 +61,11 @@ public class GenotypesContext implements List { */ ArrayList notToBeDirectlyAccessedGenotypes; + /** + * Cached value of the maximum ploidy observed among all samples + */ + private int maxPloidy = -1; + /** Are we allowing users to modify the list? */ boolean immutable = false; @@ -408,6 +413,16 @@ public class GenotypesContext implements List { return getGenotypes().get(i); } + @Ensures("result >= 0") + public int getMaxPloidy() { + if ( maxPloidy == -1 ) { + for ( final Genotype g : getGenotypes() ) { + maxPloidy = Math.max(g.getPloidy(), maxPloidy); + } + } + return maxPloidy; + } + /** * Gets sample associated with this sampleName, or null if none is found * diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index b4adc0a8a..dc600d97c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -626,14 +626,13 @@ public class VariantContext implements Feature { // to enable tribble integratio /** * Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes + * + * This function is caching, so it's only expensive on the first call + * * @return -1, or the max ploidy */ public int getMaxPloidy() { - int max = -1; - for ( final Genotype g : getGenotypes() ) { - max = Math.max(g.getPloidy(), max); - } - return max; + return genotypes.getMaxPloidy(); } /** @@ -1381,7 +1380,7 @@ 
public class VariantContext implements Feature { // to enable tribble integratio && format.getCountType() != VCFHeaderLineCount.UNBOUNDED && format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements final int obsSize = decoded instanceof List ? ((List) decoded).size() : 1; - final int expSize = format.getCount(this.getNAlleles() - 1); + final int expSize = format.getCount(this); if ( obsSize != expSize ) { throw new UserException.MalformedVCFHeader("Discordant field size detected for field " + field + " at " + getChr() + ":" + getStart() + ". Field had " + obsSize + " values " + diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index 83ddd2a1f..01d3ab456 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -159,16 +159,20 @@ public class VariantContextBuilder { return this; } - public VariantContextBuilder alleles(final String ... alleleStrings) { - List alleles = new ArrayList(alleleStrings.length); + public VariantContextBuilder alleles(final List alleleStrings) { + List alleles = new ArrayList(alleleStrings.size()); - for ( int i = 0; i < alleleStrings.length; i++ ) { - alleles.add(Allele.create(alleleStrings[i], i == 0)); + for ( int i = 0; i < alleleStrings.size(); i++ ) { + alleles.add(Allele.create(alleleStrings.get(i), i == 0)); } return alleles(alleles); } + public VariantContextBuilder alleles(final String ... 
alleleStrings) { + return alleles(Arrays.asList(alleleStrings)); + } + public List getAlleles() { return new ArrayList(alleles); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 223b4509b..b697b3381 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -1199,8 +1199,8 @@ public class VariantContextUtils { altAlleleIndexToUse[i] = true; } - // calculateNumLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, DEFAULT_PLOIDY); + // numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY); for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); // consider this entry only if both of the alleles are good diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java index ecc1cd3e0..812e6dd07 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java @@ -185,7 +185,7 @@ public abstract class BCF2FieldEncoder { @Requires("hasContextDeterminedNumElements()") @Ensures("result >= 0") public int numElements(final VariantContext vc) { - return headerLine.getCount(vc.getNAlleles() - 1); + return headerLine.getCount(vc); } /** diff --git 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 03a62019c..651223ac3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -402,7 +402,7 @@ class VCFWriter extends IndexingVariantContextWriter { VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field); if ( metaData != null ) { - int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size()); + int numInFormatField = metaData.getCount(vc); if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. // For example, if Number=2, the string has to be ".,." diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index abaf23132..69f42e1f9 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -100,10 +100,10 @@ public class GenotypeLikelihoodsUnitTest { for (int nAlleles=2; nAlleles<=5; nAlleles++) // simplest case: diploid - Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2); + Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2); // some special cases: ploidy = 20, #alleles = 4 - Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(4, 20), 1771); + Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(4, 20), 1771); } @Test diff --git 
a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 14d60b6e9..e5b45f70f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -51,6 +51,8 @@ import java.util.*; public class VariantContextTestProvider { final protected static Logger logger = Logger.getLogger(VariantContextTestProvider.class); + final private static boolean ENABLE_GENOTYPE_TESTS = true; + final private static boolean ENABLE_A_AND_G_TESTS = true; final private static boolean ENABLE_VARARRAY_TESTS = true; final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; @@ -180,6 +182,7 @@ public class VariantContextTestProvider { addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.String); addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer); + addHeaderLine(metaData, "ADA", VCFHeaderLineCount.A, VCFHeaderLineType.Integer); addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); @@ -268,9 +271,13 @@ public class VariantContextTestProvider { add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3"))); add(builder().attribute("VAR.INFO.STRING", null)); - addGenotypesToTestData(); + if ( ENABLE_GENOTYPE_TESTS ) { + addGenotypesToTestData(); + addComplexGenotypesTest(); + } - addComplexGenotypesTest(); + if ( ENABLE_A_AND_G_TESTS ) + addGenotypesAndGTests(); } private static void addGenotypesToTestData() { @@ -315,7 +322,6 @@ public class VariantContextTestProvider { } } - private static void addGenotypes( final VariantContext site) { // test ref/ref final Allele ref = 
site.getReference(); @@ -516,6 +522,46 @@ public class VariantContextTestProvider { } } + private static void addGenotypesAndGTests() { +// for ( final int ploidy : Arrays.asList(2)) { + for ( final int ploidy : Arrays.asList(1, 2, 3, 4, 5)) { + final List> alleleCombinations = + Arrays.asList( + Arrays.asList("A"), + Arrays.asList("A", "C"), + Arrays.asList("A", "C", "G"), + Arrays.asList("A", "C", "G", "T")); + + for ( final List alleles : alleleCombinations ) { + final VariantContextBuilder vcb = builder().alleles(alleles); + final VariantContext site = vcb.make(); + final int nAlleles = site.getNAlleles(); + final Allele ref = site.getReference(); + + // base genotype is ref/.../ref up to ploidy + final List baseGenotype = new ArrayList(ploidy); + for ( int i = 0; i < ploidy; i++) baseGenotype.add(ref); + final int nPLs = GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy); + + // ada is 0, 1, ..., nAlleles - 1 + final List ada = new ArrayList(nAlleles); + for ( int i = 0; i < nAlleles - 1; i++ ) ada.add(i); + + // pl is 0, 1, ..., up to nPLs (complex calc of nAlleles and ploidy) + final int[] pl = new int[nPLs]; + for ( int i = 0; i < pl.length; i++ ) pl[i] = i; + + final GenotypeBuilder gb = new GenotypeBuilder("ADA_PL_SAMPLE"); + gb.alleles(baseGenotype); + gb.PL(pl); + gb.attribute("ADA", nAlleles == 2 ? ada.get(0) : ada); + vcb.genotypes(gb.make()); + + add(vcb); + } + } + } + private static Genotype attr(final String name, final Allele ref, final String key, final Object ... 
value) { if ( value.length == 0 ) return GenotypeBuilder.create(name, Arrays.asList(ref, ref)); From 91f02dfd851c5449fd55538b34b5fa023e8c2c1b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 26 Jun 2012 17:10:58 -0400 Subject: [PATCH 14/32] fixing pipeline tests (sorry, my bad) --- .../sting/queue/pipeline/DataProcessingPipelineTest.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala index 7e1d09b70..cf4adefe6 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -39,7 +39,6 @@ class DataProcessingPipelineTest { " -R " + BaseTest.testDir + "exampleFASTA.fasta", " -i " + BaseTest.testDir + "exampleBAM.bam", " -D " + BaseTest.testDir + "exampleDBSNP.vcf", - " -nv ", " -test ", " -p " + projectName).mkString spec.fileMD5s += testOut -> "1f85e76de760167a77ed1d9ab4da2936" @@ -57,7 +56,6 @@ class DataProcessingPipelineTest { " -R " + BaseTest.testDir + "exampleFASTA.fasta", " -i " + BaseTest.testDir + "exampleBAM.bam", " -D " + BaseTest.testDir + "exampleDBSNP.vcf", - " -nv ", " -test ", " -bwa /home/unix/carneiro/bin/bwa", " -bwape ", From cd32b6ae541ecd9a74217acdac3d8b5afcc1c9d5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 26 Jun 2012 17:30:30 -0400 Subject: [PATCH 15/32] CombineVariantsUnitTest was failing because the header repair was fixing the problem it wanted to detect --- .../gatk/walkers/variantutils/CombineVariantsUnitTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java index 
31f704b85..21d49638f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java @@ -39,9 +39,9 @@ public class CombineVariantsUnitTest { "##fileformat=VCFv4.0\n"+ "##filedate=2010-06-21\n"+ "##reference=NCBI36\n"+ - "##INFO=\n"+ + "##INFO=\n"+ "##INFO=\n"+ - "##INFO=\n"+ // string to integer + "##INFO=\n"+ // string to integer "##FILTER=\n"+ "##FORMAT=\n"+ "##FORMAT=\n"+ From 016b25be87f81afd07f215236bb3fa20d45ab3db Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 26 Jun 2012 17:30:48 -0400 Subject: [PATCH 16/32] Update annoying md5s in unit tests, also failing because of header fixing --- .../sting/utils/codecs/vcf/VCFHeaderUnitTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderUnitTest.java index 70460ae01..b8d6f2d1d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderUnitTest.java @@ -33,13 +33,13 @@ public class VCFHeaderUnitTest extends BaseTest { @Test public void testVCF4ToVCF4() { VCFHeader header = createHeader(VCF4headerStrings); - checkMD5ofHeaderFile(header, "47d32e7901650ba69ed41322af638806"); + checkMD5ofHeaderFile(header, "f05a57053a0c6a5bac15dba566f7f7ff"); } @Test public void testVCF4ToVCF4_alternate() { VCFHeader header = createHeader(VCF4headerStrings_with_negativeOne); - checkMD5ofHeaderFile(header, "954e9dd756d5f952cfb401a4db6bd145"); + checkMD5ofHeaderFile(header, "b1d71cc94261053131f8d239d65a8c9f"); } /** @@ -112,7 +112,7 @@ public class VCFHeaderUnitTest extends BaseTest { "##reference=NCBI36\n"+ "##INFO=\n"+ "##INFO=\n"+ - "##INFO=\n"+ + "##INFO=\n"+ "##INFO=\n"+ "##INFO=\n"+ 
"##INFO=\n"+ @@ -132,7 +132,7 @@ public class VCFHeaderUnitTest extends BaseTest { "##reference=NCBI36\n"+ "##INFO=\n"+ "##INFO=\n"+ - "##INFO=\n"+ + "##INFO=\n"+ "##INFO=\n"+ "##INFO=\n"+ "##INFO=\n"+ From 746a5e95f318b0da85f98ce9da1288d7cc1cf796 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Wed, 27 Jun 2012 01:15:22 -0400 Subject: [PATCH 18/32] Refactored parsing of Rod/IntervalBinding. Queue S/G now uses all interval arguments passed to CommandLineGATK QFunctions including support for BED/tribble types, XL, ISR, and padding. Updated HSP to use new padding arguments instead of flank intervals file, plus latest QC evals. IntervalUtils return unmodifiable lists so that utilities don't mutate the collections. Added a JavaCommandLineFunction.javaGCThreads option to test reducing java's automatic GC thread allocation based on num cpus. Added comma to list of characters to convert to underscores in GridEngine job names so that GE JSV doesn't choke on the -N values. JobRunInfo handles the null done times when jobs crash with strange errors. 
--- .../commandline/ArgumentTypeDescriptor.java | 231 +++++++++--------- .../sting/commandline/ParsingMethod.java | 41 ++-- .../sting/gatk/GenomeAnalysisEngine.java | 56 +---- .../sting/utils/interval/IntervalUtils.java | 92 ++++++- .../gatk/GenomeAnalysisEngineUnitTest.java | 78 +----- .../utils/interval/IntervalUtilsUnitTest.java | 99 +++++++- .../sting/queue/engine/JobRunInfo.scala | 22 +- .../gridengine/GridEngineJobRunner.scala | 2 +- .../gatk/ContigScatterFunction.scala | 4 +- .../queue/extensions/gatk/GATKIntervals.scala | 66 ++++- .../extensions/gatk/GATKScatterFunction.scala | 51 ++-- .../gatk/IntervalScatterFunction.scala | 4 +- .../gatk/LocusScatterFunction.scala | 4 +- .../extensions/gatk/VcfGatherFunction.scala | 9 +- .../function/JavaCommandLineFunction.scala | 7 + .../sting/queue/util/QJobReport.scala | 4 +- .../ScalaCompoundArgumentTypeDescriptor.scala | 2 +- .../gatk/GATKIntervalsUnitTest.scala | 91 +++++-- .../ExampleUnifiedGenotyperPipelineTest.scala | 39 +++ 19 files changed, 550 insertions(+), 352 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 94ed23caf..d5503b2a9 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -289,7 +289,7 @@ public abstract class ArgumentTypeDescriptor { return field.isAnnotationPresent(Hidden.class); } - public Class makeRawTypeIfNecessary(Type t) { + public static Class makeRawTypeIfNecessary(Type t) { if ( t == null ) return null; else if ( t instanceof ParameterizedType ) @@ -300,6 +300,114 @@ public abstract class ArgumentTypeDescriptor { throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); } } + + /** + * The actual argument parsing method. 
+ * @param source source + * @param type type to check + * @param matches matches + * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. + */ + protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + String value = getArgumentValue(defaultDefinition, matches); + @SuppressWarnings("unchecked") + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + String name = defaultDefinition.fullName; + + return parseBinding(value, parameterType, type, name, tags, source.field.getName()); + } + + /** + * + * @param value The source of the binding + * @param parameterType The Tribble Feature parameter type + * @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding. + * @param bindingName The name of the binding passed to the constructor. + * @param tags Tags for the binding used for parsing and passed to the constructor. + * @param fieldName The name of the field that was parsed. Used for error reporting. + * @return The newly created binding object of type bindingClass. + */ + public static Object parseBinding(String value, Class parameterType, Type bindingClass, + String bindingName, Tags tags, String fieldName) { + try { + String tribbleType = null; + // must have one or two tag values here + if ( tags.getPositionalTags().size() > 2 ) { + throw new UserException.CommandLineException( + String.format("Unexpected number of positional tags for argument %s : %s. 
" + + "Rod bindings only support -X:type and -X:name,type argument styles", + value, fieldName)); + } else if ( tags.getPositionalTags().size() == 2 ) { + // -X:name,type style + bindingName = tags.getPositionalTags().get(0); + tribbleType = tags.getPositionalTags().get(1); + + FeatureManager manager = new FeatureManager(); + if ( manager.getByName(tribbleType) == null ) + throw new UserException.CommandLineException( + String.format("Unable to find tribble type '%s' provided on the command line. " + + "Please select a correct type from among the supported types:%n%s", + tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); + + } else { + // case with 0 or 1 positional tags + FeatureManager manager = new FeatureManager(); + + // -X:type style is a type when we cannot determine the type dynamically + String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; + if ( tag1 != null ) { + if ( manager.getByName(tag1) != null ) // this a type + tribbleType = tag1; + else + bindingName = tag1; + } + + if ( tribbleType == null ) { + // try to determine the file type dynamically + File file = new File(value); + if ( file.canRead() && file.isFile() ) { + FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); + if ( featureDescriptor != null ) { + tribbleType = featureDescriptor.getName(); + logger.info("Dynamically determined type of " + file + " to be " + tribbleType); + } + } + + if ( tribbleType == null ) { + // IntervalBinding can be created from a normal String + Class rawType = (makeRawTypeIfNecessary(bindingClass)); + try { + return rawType.getConstructor(String.class).newInstance(value); + } catch (NoSuchMethodException e) { + /* ignore */ + } + + if ( ! file.exists() ) { + throw new UserException.CouldNotReadInputFile(file, "file does not exist"); + } else if ( ! file.canRead() || ! 
file.isFile() ) { + throw new UserException.CouldNotReadInputFile(file, "file could not be read"); + } else { + throw new UserException.CommandLineException( + String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + + "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", + manager.userFriendlyListOfAvailableFeatures(parameterType))); + } + } + } + } + + Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); + return ctor.newInstance(parameterType, bindingName, value, tribbleType, tags); + } catch (Exception e) { + if ( e instanceof UserException ) + throw ((UserException)e); + else + throw new UserException.CommandLineException( + String.format("Failed to parse value %s for argument %s. Message: %s", + value, fieldName, e.getMessage())); + } + } } /** @@ -324,6 +432,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { public boolean createsTypeDefault(ArgumentSource source) { return ! source.isRequired(); } @Override + @SuppressWarnings("unchecked") public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { Class parameterType = JVMUtils.getParameterizedTypeClass(type); return RodBinding.makeUnbound((Class)parameterType); @@ -336,118 +445,16 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - return parse(parsingEngine, source, type, matches, false); - } - - /** - * The actual argument parsing method. - * - * IMPORTANT NOTE: the createIntervalBinding argument is a bit of a hack, but after discussions with SE we've decided - * that it's the best way to proceed for now. 
IntervalBindings can either be proper RodBindings (hence the use of - * this parse() method) or can be Strings (representing raw intervals or the files containing them). If createIntervalBinding - * is true, we do not call parsingEngine.addRodBinding() because we don't want walkers to assume that these are the - * usual set of RodBindings. It also allows us in the future to be smart about tagging rods as intervals. One other - * side point is that we want to continue to allow the usage of non-Feature intervals so that users can theoretically - * continue to input them out of order (whereas Tribble Features are ordered). - * - * @param parsingEngine parsing engine - * @param source source - * @param type type to check - * @param matches matches - * @param createIntervalBinding should we attempt to create an IntervalBinding instead of a RodBinding? - * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. - */ - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches, boolean createIntervalBinding) { - ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - String value = getArgumentValue( defaultDefinition, matches ); - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - - try { - String name = defaultDefinition.fullName; - String tribbleType = null; - Tags tags = getArgumentTags(matches); - // must have one or two tag values here - if ( tags.getPositionalTags().size() > 2 ) { - throw new UserException.CommandLineException( - String.format("Unexpected number of positional tags for argument %s : %s. 
" + - "Rod bindings only support -X:type and -X:name,type argument styles", - value, source.field.getName())); - } if ( tags.getPositionalTags().size() == 2 ) { - // -X:name,type style - name = tags.getPositionalTags().get(0); - tribbleType = tags.getPositionalTags().get(1); - } else { - // case with 0 or 1 positional tags - FeatureManager manager = new FeatureManager(); - - // -X:type style is a type when we cannot determine the type dynamically - String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; - if ( tag1 != null ) { - if ( manager.getByName(tag1) != null ) // this a type - tribbleType = tag1; - else - name = tag1; - } - - if ( tribbleType == null ) { - // try to determine the file type dynamically - File file = new File(value); - if ( file.canRead() && file.isFile() ) { - FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); - if ( featureDescriptor != null ) { - tribbleType = featureDescriptor.getName(); - logger.info("Dynamically determined type of " + file + " to be " + tribbleType); - } - } - - if ( tribbleType == null ) { - // IntervalBindings allow streaming conversion of Strings - if ( createIntervalBinding ) { - return new IntervalBinding(value); - } - - if ( ! file.exists() ) { - throw new UserException.CouldNotReadInputFile(file, "file does not exist"); - } else if ( ! file.canRead() || ! file.isFile() ) { - throw new UserException.CouldNotReadInputFile(file, "file could not be read"); - } else { - throw new UserException.CommandLineException( - String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. 
" + - "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", - manager.userFriendlyListOfAvailableFeatures(parameterType))); - } - } - } - } - - Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); - Object result; - if ( createIntervalBinding ) { - result = ctor.newInstance(parameterType, name, value, tribbleType, tags); - } else { - RodBinding rbind = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags); - parsingEngine.addTags(rbind, tags); - parsingEngine.addRodBinding(rbind); - result = rbind; - } - return result; - } catch (InvocationTargetException e) { - throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s.", - value, source.field.getName())); - } catch (Exception e) { - if ( e instanceof UserException ) - throw ((UserException)e); - else - throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s. 
Message: %s", - value, source.field.getName(), e.getMessage())); - } + Tags tags = getArgumentTags(matches); + RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags); + parsingEngine.addTags(rbind, tags); + parsingEngine.addRodBinding(rbind); + return rbind; } } /** - * Parser for RodBinding objects + * Parser for IntervalBinding objects */ class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { /** @@ -475,7 +482,7 @@ class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { */ @Override public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - return new RodBindingArgumentTypeDescriptor().parse(parsingEngine, source, type, matches, true); + return parseBinding(source, type, matches, getArgumentTags(matches)); } } @@ -783,7 +790,7 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { } Class multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value(); - Constructor multiplexerConstructor = null; + Constructor multiplexerConstructor; try { multiplexerConstructor = multiplexerType.getConstructor(sourceTypes); multiplexerConstructor.setAccessible(true); @@ -792,7 +799,7 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { throw new ReviewedStingException(String.format("Unable to find constructor for class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); } - Multiplexer multiplexer = null; + Multiplexer multiplexer; try { multiplexer = multiplexerConstructor.newInstance(sourceValues); } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java index 452309e89..26af49e12 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java @@ -78,24 +78,7 @@ public 
abstract class ParsingMethod { String argument = matcher.group(1).trim(); - Tags tags = new Tags(); - if(matcher.group(2) != null) { - for(String tag: Utils.split(matcher.group(2),",")) { - // Check for presence of an '=' sign, indicating a key-value pair in the tag line. - int equalDelimiterPos = tag.indexOf('='); - if(equalDelimiterPos >= 0) { - // Sanity check; ensure that there aren't multiple '=' in this key-value pair. - if(tag.indexOf('=',equalDelimiterPos+1) >= 0) - throw new ArgumentException(String.format("Tag %s passed to argument %s is malformed. Please ensure that " + - "key-value tags are of the form =, and neither key " + - "nor value contain the '=' character", tag, argument)); - tags.addKeyValueTag(tag.substring(0,equalDelimiterPos),tag.substring(equalDelimiterPos+1)); - } - else - tags.addPositionalTag(tag); - - } - } + Tags tags = parseTags(argument, matcher.group(2)); // Find the most appropriate argument definition for the given argument. ArgumentDefinition argumentDefinition = definitions.findArgumentDefinition( argument, definitionMatcher ); @@ -105,6 +88,28 @@ public abstract class ParsingMethod { return new ArgumentMatch(argument,argumentDefinition,position,tags); } + public static Tags parseTags(String argument, String tagString) { + Tags tags = new Tags(); + if (tagString != null) { + for(String tag: Utils.split(tagString, ",")) { + // Check for presence of an '=' sign, indicating a key-value pair in the tag line. + int equalDelimiterPos = tag.indexOf('='); + if(equalDelimiterPos >= 0) { + // Sanity check; ensure that there aren't multiple '=' in this key-value pair. + if(tag.indexOf('=',equalDelimiterPos+1) >= 0) + throw new ArgumentException(String.format("Tag %s passed to argument %s is malformed. 
Please ensure that " + + "key-value tags are of the form =, and neither key " + + "nor value contain the '=' character", tag, argument)); + tags.addKeyValueTag(tag.substring(0,equalDelimiterPos),tag.substring(equalDelimiterPos+1)); + } + else + tags.addPositionalTag(tag); + + } + } + return tags; + } + /** * A command-line argument always starts with an alphabetical character or underscore followed by any word character. */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 80cbd3dad..68680dd10 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -30,7 +30,6 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; -import org.broad.tribble.Feature; import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; @@ -54,9 +53,9 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -582,7 +581,6 @@ public class GenomeAnalysisEngine { * Setup the intervals to be processed */ protected void initializeIntervals() { - // return if no 
interval arguments at all if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) return; @@ -590,17 +588,22 @@ public class GenomeAnalysisEngine { // Note that the use of '-L all' is no longer supported. // if include argument isn't given, create new set of all possible intervals - GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ? - GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(argCollection.intervals, argCollection.intervalSetRule, argCollection.intervalPadding)); + + Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( + this.referenceDataSource, + argCollection.intervals, + argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, + argCollection.excludeIntervals); + + GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); // if no exclude arguments, can return parseIntervalArguments directly - if ( argCollection.excludeIntervals == null ) + if ( excludeSortedSet == null ) intervals = includeSortedSet; // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets else { - GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, IntervalSetRule.UNION); intervals = includeSortedSet.subtractRegions(excludeSortedSet); // logging messages only printed when exclude (-XL) arguments are given @@ -613,43 +616,6 @@ public class GenomeAnalysisEngine { } } - /** - * Loads the intervals relevant to the current execution - * @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above - * @param rule interval merging rule - * @return A sorted, merged list of all intervals specified in this arg list. 
- */ - protected GenomeLocSortedSet loadIntervals( final List> argList, final IntervalSetRule rule ) { - return loadIntervals(argList, rule, 0); - } - - /** - * Loads the intervals relevant to the current execution - * @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above - * @param rule interval merging rule - * @param padding how much to pad the intervals - * @return A sorted, merged list of all intervals specified in this arg list. - */ - protected GenomeLocSortedSet loadIntervals( final List> argList, final IntervalSetRule rule, final int padding ) { - - List allIntervals = new ArrayList(); - for ( IntervalBinding intervalBinding : argList ) { - List intervals = intervalBinding.getIntervals(this.getGenomeLocParser()); - - if ( intervals.isEmpty() ) { - logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); - } - - if ( padding > 0 ) { - intervals = IntervalUtils.getIntervalsWithFlanks(this.getGenomeLocParser(), intervals, padding); - } - - allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, rule); - } - - return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging); - } - /** * Add additional, externally managed IO streams for inputs. 
* diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index c96226405..6ee4af288 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -6,6 +6,8 @@ import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -169,21 +171,23 @@ public class IntervalUtils { */ public static List mergeListsBySetOperator(List setOne, List setTwo, IntervalSetRule rule) { // shortcut, if either set is zero, return the other set - if (setOne == null || setOne.size() == 0 || setTwo == null || setTwo.size() == 0) return (setOne == null || setOne.size() == 0) ? setTwo : setOne; + if (setOne == null || setOne.size() == 0 || setTwo == null || setTwo.size() == 0) + return Collections.unmodifiableList((setOne == null || setOne.size() == 0) ? 
setTwo : setOne); + + // our master list, since we can't guarantee removal time in a generic list + LinkedList retList = new LinkedList(); // if we're set to UNION, just add them all - if (rule == IntervalSetRule.UNION) { - setOne.addAll(setTwo); - return setOne; + if (rule == null || rule == IntervalSetRule.UNION) { + retList.addAll(setOne); + retList.addAll(setTwo); + return Collections.unmodifiableList(retList); } // else we're INTERSECTION, create two indexes into the lists int iOne = 0; int iTwo = 0; - // our master list, since we can't guarantee removal time in a generic list - LinkedList retList = new LinkedList(); - // merge the second into the first using the rule while (iTwo < setTwo.size() && iOne < setOne.size()) // if the first list is ahead, drop items off the second until we overlap @@ -204,7 +208,7 @@ public class IntervalUtils { throw new UserException.BadInput("The INTERSECTION of your -L options produced no intervals."); // we don't need to add the rest of remaining locations, since we know they don't overlap. return what we have - return retList; + return Collections.unmodifiableList(retList); } /** @@ -218,6 +222,8 @@ public class IntervalUtils { * @return A sorted, merged version of the intervals passed in. 
*/ public static GenomeLocSortedSet sortAndMergeIntervals(GenomeLocParser parser, List intervals, IntervalMergingRule mergingRule) { + // Make a copy of the (potentially unmodifiable) list to be sorted + intervals = new ArrayList(intervals); // sort raw interval list Collections.sort(intervals); // now merge raw interval list @@ -481,6 +487,70 @@ public class IntervalUtils { return new SplitLocusRecursive(split, remaining); } + /** + * Setup the intervals to be processed + */ + public static GenomeLocSortedSet parseIntervalBindings( + final ReferenceDataSource referenceDataSource, + final List> intervals, + final IntervalSetRule intervalSetRule, final IntervalMergingRule intervalMergingRule, final int intervalPadding, + final List> excludeIntervals) { + + Pair includeExcludePair = parseIntervalBindingsPair( + referenceDataSource, intervals, intervalSetRule, intervalMergingRule, intervalPadding, excludeIntervals); + + GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); + + if (excludeSortedSet != null) { + return includeSortedSet.subtractRegions(excludeSortedSet); + } else { + return includeSortedSet; + } + } + + public static Pair parseIntervalBindingsPair( + final ReferenceDataSource referenceDataSource, + final List> intervals, + final IntervalSetRule intervalSetRule, final IntervalMergingRule intervalMergingRule, final int intervalPadding, + final List> excludeIntervals) { + GenomeLocParser genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); + + // if include argument isn't given, create new set of all possible intervals + GenomeLocSortedSet includeSortedSet = ((intervals == null || intervals.size() == 0) ? 
+ GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()) : + loadIntervals(intervals, intervalSetRule, intervalMergingRule, intervalPadding, genomeLocParser)); + + GenomeLocSortedSet excludeSortedSet = null; + if (excludeIntervals != null && excludeIntervals.size() > 0) { + excludeSortedSet = loadIntervals(excludeIntervals, IntervalSetRule.UNION, intervalMergingRule, 0, genomeLocParser); + } + return new Pair(includeSortedSet, excludeSortedSet); + } + + public static GenomeLocSortedSet loadIntervals( + final List> intervalBindings, + final IntervalSetRule rule, final IntervalMergingRule intervalMergingRule, final int padding, + final GenomeLocParser genomeLocParser) { + List allIntervals = new ArrayList(); + for ( IntervalBinding intervalBinding : intervalBindings) { + @SuppressWarnings("unchecked") + List intervals = intervalBinding.getIntervals(genomeLocParser); + + if ( intervals.isEmpty() ) { + logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); + } + + if ( padding > 0 ) { + intervals = getIntervalsWithFlanks(genomeLocParser, intervals, padding); + } + + allIntervals = mergeListsBySetOperator(intervals, allIntervals, rule); + } + + return sortAndMergeIntervals(genomeLocParser, allIntervals, intervalMergingRule); + } + private final static class SplitLocusRecursive { final List split; final LinkedList remaining; @@ -546,7 +616,7 @@ public class IntervalUtils { */ public static List mergeIntervalLocations(final List raw, IntervalMergingRule rule) { if (raw.size() <= 1) - return raw; + return Collections.unmodifiableList(raw); else { ArrayList merged = new ArrayList(); Iterator it = raw.iterator(); @@ -555,7 +625,7 @@ public class IntervalUtils { GenomeLoc curr = it.next(); if (prev.overlapsP(curr)) { prev = prev.merge(curr); - } else if (prev.contiguousP(curr) && rule == IntervalMergingRule.ALL) { + } else if (prev.contiguousP(curr) && (rule == 
null || rule == IntervalMergingRule.ALL)) { prev = prev.merge(curr); } else { merged.add(prev); @@ -563,7 +633,7 @@ public class IntervalUtils { } } merged.add(prev); - return merged; + return Collections.unmodifiableList(merged); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 3ce62b697..2f8b1e9b5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -24,32 +24,17 @@ package org.broadinstitute.sting.gatk; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.util.Interval; -import net.sf.picard.util.IntervalList; -import net.sf.samtools.SAMFileHeader; -import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ArgumentException; -import org.broadinstitute.sting.commandline.IntervalBinding; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.walkers.PrintReadsWalker; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; -import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collection; -import java.util.List; - /** * Tests selected functionality in the GenomeAnalysisEngine class @@ -91,65 +76,4 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { 
testEngine.validateSuppliedIntervals(); } - - @DataProvider(name="invalidIntervalTestData") - public Object[][] invalidIntervalDataProvider() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - GATKArgumentCollection argCollection = new GATKArgumentCollection(); - testEngine.setArguments(argCollection); - - File fastaFile = new File("public/testdata/exampleFASTA.fasta"); - GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); - testEngine.setGenomeLocParser(genomeLocParser); - - return new Object[][] { - new Object[] {testEngine, genomeLocParser, "chr1", 10000000, 20000000}, - new Object[] {testEngine, genomeLocParser, "chr2", 1, 2}, - new Object[] {testEngine, genomeLocParser, "chr1", -1, 50} - }; - } - - @Test(dataProvider="invalidIntervalTestData") - public void testInvalidPicardIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, - String contig, int intervalStart, int intervalEnd ) throws Exception { - - SAMFileHeader picardFileHeader = new SAMFileHeader(); - picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); - IntervalList picardIntervals = new IntervalList(picardFileHeader); - picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); - - File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); - picardIntervals.write(picardIntervalFile); - - List> intervalArgs = new ArrayList>(1); - intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); - - testEngine.loadIntervals(intervalArgs, IntervalSetRule.UNION); - } - - @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") - public void testInvalidGATKFileIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, - String contig, int intervalStart, int intervalEnd ) throws Exception { - - File gatkIntervalFile = 
createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", - String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); - - List> intervalArgs = new ArrayList>(1); - intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); - - testEngine.loadIntervals(intervalArgs, IntervalSetRule.UNION); - } - - private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { - File tempFile = File.createTempFile(tempFilePrefix, tempFileExtension); - tempFile.deleteOnExit(); - - PrintWriter out = new PrintWriter(tempFile); - for ( String line : lines ) { - out.println(line); - } - out.close(); - - return tempFile; - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 28573c600..3a9183e9a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,12 +1,16 @@ package org.broadinstitute.sting.utils.interval; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.util.Interval; +import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.apache.commons.io.FileUtils; import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -45,7 +49,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List locs = new ArrayList(); for (String 
interval: intervals) locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return locs; + return Collections.unmodifiableList(locs); } @BeforeClass @@ -277,7 +281,10 @@ public class IntervalUtilsUnitTest extends BaseTest { listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); } - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 100); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, null); Assert.assertEquals(ret.size(), 100); ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 0); @@ -296,7 +303,10 @@ public class IntervalUtilsUnitTest extends BaseTest { allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); } - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 150); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, null); Assert.assertEquals(ret.size(), 150); ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 50); @@ -316,7 +326,10 @@ public class IntervalUtilsUnitTest extends BaseTest { } } - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 40); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, null); Assert.assertEquals(ret.size(), 40); ret = 
IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 20); @@ -761,7 +774,13 @@ public class IntervalUtilsUnitTest extends BaseTest { List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(privateTestDir + unmergedIntervals)); Assert.assertEquals(locs.size(), 2); - List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); + List merged; + + merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); + Assert.assertEquals(merged.size(), 1); + + // Test that null means the same as ALL + merged = IntervalUtils.mergeIntervalLocations(locs, null); Assert.assertEquals(merged.size(), 1); } @@ -993,6 +1012,74 @@ public class IntervalUtilsUnitTest extends BaseTest { // Attempting to use the legacy -L "interval1;interval2" syntax should produce an exception: IntervalBinding binding = new IntervalBinding("1;2"); - List intervals = binding.getIntervals(toolkit); + binding.getIntervals(toolkit); + } + + @DataProvider(name="invalidIntervalTestData") + public Object[][] invalidIntervalDataProvider() throws Exception { + GATKArgumentCollection argCollection = new GATKArgumentCollection(); + File fastaFile = new File("public/testdata/exampleFASTA.fasta"); + GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); + + return new Object[][] { + new Object[] {argCollection, genomeLocParser, "chr1", 10000000, 20000000}, + new Object[] {argCollection, genomeLocParser, "chr2", 1, 2}, + new Object[] {argCollection, genomeLocParser, "chr1", -1, 50} + }; + } + + @Test(dataProvider="invalidIntervalTestData") + public void testInvalidPicardIntervalHandling(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + SAMFileHeader picardFileHeader = new SAMFileHeader(); + 
picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); + IntervalList picardIntervals = new IntervalList(picardFileHeader); + picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); + + File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); + picardIntervals.write(picardIntervalFile); + + List> intervalArgs = new ArrayList>(1); + intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); + + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidGATKFileIntervalHandling(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", + String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); + + List> intervalArgs = new ArrayList>(1); + intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); + + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + } + + private File createTempFile( String tempFilePrefix, String tempFileExtension, String... 
lines ) throws Exception { + File tempFile = BaseTest.createTempFile(tempFilePrefix, tempFileExtension); + FileUtils.writeLines(tempFile, Arrays.asList(lines)); + return tempFile; + } + + @DataProvider(name = "sortAndMergeIntervals") + public Object[][] getSortAndMergeIntervals() { + return new Object[][] { + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1:3", "chr1:2"), getLocs("chr1:1", "chr1:2", "chr1:3") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1:3", "chr1:2"), getLocs("chr1:1-3") }, + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1:3", "chr2:2"), getLocs("chr1:1", "chr1:3", "chr2:2") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1:3", "chr2:2"), getLocs("chr1:1", "chr1:3", "chr2:2") }, + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1"), getLocs("chr1") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1"), getLocs("chr1") } + }; + } + + @Test(dataProvider = "sortAndMergeIntervals") + public void testSortAndMergeIntervals(IntervalMergingRule merge, List unsorted, List expected) { + List sorted = IntervalUtils.sortAndMergeIntervals(hg18GenomeLocParser, unsorted, merge).toList(); + Assert.assertEquals(sorted, expected); } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala index 2caa4d2aa..078331602 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala @@ -24,7 +24,6 @@ package org.broadinstitute.sting.queue.engine -import java.util.Date import java.text.SimpleDateFormat /** @@ -36,18 +35,21 @@ class JobRunInfo { val formatter = new SimpleDateFormat("yy-MM-dd H:mm:ss:SSS"); /** The start time with millisecond resolution of this job */ - var startTime: Date = _ + var startTime: java.util.Date = _ /** 
The done time with millisecond resolution of this job */ - var doneTime: Date = _ + var doneTime: java.util.Date = _ var exechosts: String = "localhost" - def getStartTime = startTime - def getDoneTime = doneTime - def getFormattedStartTime = formatTime(getStartTime) - def getFormattedDoneTime = formatTime(getDoneTime) + def getStartTime: String = getTime(startTime) + def getDoneTime: String = getTime(doneTime) + def getFormattedStartTime = formatTime(startTime) + def getFormattedDoneTime = formatTime(doneTime) + + /** Helper function that returns the time of the date */ + private def getTime(d: java.util.Date): String = if ( d != null ) d.getTime.toString else "null" /** Helper function that pretty prints the date */ - private def formatTime(d: Date) = if ( d != null ) formatter.format(d) else "null" + private def formatTime(d: java.util.Date): String = if ( d != null ) formatter.format(d) else "null" def getExecHosts = exechosts @@ -55,14 +57,14 @@ class JobRunInfo { * Was any information set for this jobInfo? JobInfo can be unset because * the job never ran or because it already completed. */ - def isFilledIn = startTime != null + def isFilledIn = startTime != null && doneTime != null /** * How long did the job run (in wall time)? 
Returns -1 if this jobInfo isn't filled in */ def getRuntimeInMs: Long = { if ( isFilledIn ) - getDoneTime.getTime - getStartTime.getTime + doneTime.getTime - startTime.getTime else -1 } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala index 239f83482..76cefe2a5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala @@ -35,7 +35,7 @@ import org.ggf.drmaa.Session class GridEngineJobRunner(session: Session, function: CommandLineFunction) extends DrmaaJobRunner(session, function) with Logging { // Grid Engine disallows certain characters from being in job names. // This replaces all illegal characters with underscores - protected override val jobNameFilter = """[\n\t\r/:@\\*?]""" + protected override val jobNameFilter = """[\n\t\r/:,@\\*?]""" protected override val minRunnerPriority = -1023 protected override val maxRunnerPriority = 0 diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala index 2609c3607..97669030a 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala @@ -38,11 +38,11 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount protected override def maxIntervals = { - GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).contigs.size + GATKScatterFunction.getGATKIntervals(this.originalGATK).contigs.size } def 
run() { - val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + val gi = GATKScatterFunction.getGATKIntervals(this.originalGATK) IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index 2f604a809..e619c0a02 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -26,13 +26,23 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.{IntervalMergingRule, IntervalUtils} +import org.broadinstitute.sting.utils.interval.{IntervalSetRule, IntervalMergingRule, IntervalUtils} import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import net.sf.samtools.SAMFileHeader -import java.util.Collections -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} +import org.broadinstitute.sting.utils.GenomeLoc +import org.broadinstitute.sting.commandline._ +import org.broad.tribble.Feature + +case class GATKIntervals(reference: File, intervals: Seq[File], intervalsString: Seq[String], + intervalSetRule: IntervalSetRule, intervalMergingRule: IntervalMergingRule, intervalPadding: Option[Int], + excludeIntervals: Seq[File], excludeIntervalsString: Seq[String]) { + + def this(gatk: CommandLineGATK) = this( + gatk.reference_sequence, + gatk.intervals, gatk.intervalsString, + gatk.interval_set_rule, gatk.interval_merging, gatk.interval_padding, + gatk.excludeIntervals, gatk.excludeIntervalsString) -case class GATKIntervals(reference: File, intervals: Seq[String]) { private lazy val referenceDataSource = new ReferenceDataSource(reference) lazy val 
samFileHeader = { @@ -42,16 +52,46 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) { } lazy val locs: java.util.List[GenomeLoc] = { - val parser = new GenomeLocParser(referenceDataSource.getReference) - val parsedLocs = - if (intervals.isEmpty) - GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList - else - IntervalUtils.parseIntervalArguments(parser, intervals) - Collections.sort(parsedLocs) - val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY) - Collections.unmodifiableList(mergedLocs) + val includeIntervalBindings = this.intervals.map(GATKIntervals.createBinding(_, "intervals")) ++ + this.intervalsString.map(GATKIntervals.createBinding(_, "intervalsString")) + val excludeIntervalBindings = this.excludeIntervals.map(GATKIntervals.createBinding(_, "excludeIntervals")) ++ + this.excludeIntervalsString.map(GATKIntervals.createBinding(_, "excludeIntervalsString")) + + IntervalUtils.parseIntervalBindings( + referenceDataSource, + includeIntervalBindings, + intervalSetRule, intervalMergingRule, intervalPadding.getOrElse(0), + excludeIntervalBindings).toList } lazy val contigs = locs.map(_.getContig).distinct.toSeq } + +object GATKIntervals { + def copyIntervalArguments(src: CommandLineGATK, dst: CommandLineGATK) { + dst.reference_sequence = src.reference_sequence + dst.intervals = src.intervals + dst.intervalsString = src.intervalsString + dst.interval_set_rule = src.interval_set_rule + dst.interval_merging = src.interval_merging + dst.interval_padding = src.interval_padding + dst.excludeIntervals = src.excludeIntervals + dst.excludeIntervalsString = src.excludeIntervalsString + } + + private def createBinding(interval: File, argumentName: String): IntervalBinding[Feature] = { + val tags = interval match { + case taggedFile: TaggedFile => ParsingMethod.parseTags(argumentName, taggedFile.tag) + case file: File => new Tags + } + 
createBinding(interval.getAbsolutePath, argumentName, tags) + } + + private def createBinding(interval: String, argumentName: String): IntervalBinding[Feature] = { + createBinding(interval, argumentName, new Tags) + } + + private def createBinding(interval: String, argumentName: String, tags: Tags): IntervalBinding[Feature] = { + ArgumentTypeDescriptor.parseBinding(interval, classOf[Feature], classOf[IntervalBinding[Feature]], argumentName, tags, argumentName).asInstanceOf[IntervalBinding[Feature]] + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala index 28c3f41e9..9e79e8f61 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala @@ -28,14 +28,17 @@ import org.broadinstitute.sting.utils.interval.IntervalUtils import java.io.File import org.broadinstitute.sting.utils.io.IOUtils import org.broadinstitute.sting.queue.function.scattergather.{CloneFunction, ScatterFunction} -import org.broadinstitute.sting.commandline.Output +import org.broadinstitute.sting.commandline._ trait GATKScatterFunction extends ScatterFunction { - /** The runtime field to set for specifying an interval file. */ + /* The runtime field to set for specifying intervals. */ private final val intervalsField = "intervals" - - /** The runtime field to set for specifying an interval string. 
*/ private final val intervalsStringField = "intervalsString" + private final val excludeIntervalsField = "excludeIntervals" + private final val excludeIntervalsStringField = "excludeIntervalsString" + private final val intervalsSetRuleField = "interval_set_rule" + private final val intervalMergingField = "interval_merging" + private final val intervalPaddingField = "interval_padding" @Output(doc="Scatter function outputs") var scatterOutputFiles: Seq[File] = Nil @@ -43,25 +46,14 @@ trait GATKScatterFunction extends ScatterFunction { /** The original GATK function. */ protected var originalGATK: CommandLineGATK = _ - /** The reference sequence for the GATK function. */ - protected var referenceSequence: File = _ - - /** The list of interval files ("/path/to/interval.list") or interval strings ("chr1", "chr2") to parse into smaller parts. */ - protected var intervals: Seq[String] = Nil - /** Whether the last scatter job should also include any unmapped reads. */ protected var includeUnmapped: Boolean = _ override def init() { this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] - this.referenceSequence = this.originalGATK.reference_sequence - if (this.originalGATK.intervals.isEmpty && (this.originalGATK.intervalsString == null || this.originalGATK.intervalsString.isEmpty)) { - this.intervals ++= GATKScatterFunction.getGATKIntervals(this.referenceSequence, Seq.empty[String]).contigs - } else { - this.intervals ++= this.originalGATK.intervals.map(_.toString) - this.intervals ++= this.originalGATK.intervalsString.filterNot(interval => IntervalUtils.isUnmapped(interval)) + // If intervals have been specified check if unmapped is included + if (this.originalGATK.intervals.size + this.originalGATK.intervalsString.size > 0) this.includeUnmapped = this.originalGATK.intervalsString.exists(interval => IntervalUtils.isUnmapped(interval)) - } } override def isScatterGatherable = { @@ -74,6 +66,12 @@ trait GATKScatterFunction extends ScatterFunction { 
cloneFunction.setFieldValue(this.intervalsStringField, Seq("unmapped")) else cloneFunction.setFieldValue(this.intervalsStringField, Seq.empty[String]) + + cloneFunction.setFieldValue(this.intervalsSetRuleField, null) + cloneFunction.setFieldValue(this.intervalMergingField, null) + cloneFunction.setFieldValue(this.intervalPaddingField, None) + cloneFunction.setFieldValue(this.excludeIntervalsField, Seq.empty[File]) + cloneFunction.setFieldValue(this.excludeIntervalsStringField, Seq.empty[String]) } override def bindCloneInputs(cloneFunction: CloneFunction, index: Int) { @@ -85,29 +83,28 @@ trait GATKScatterFunction extends ScatterFunction { } /** - * Returns true if all interval files exist. + * @return true if all interval files exist. */ protected def intervalFilesExist = { - !this.intervals.exists(interval => IntervalUtils.isIntervalFile(interval, false) && !new File(interval).exists) + !(this.originalGATK.intervals ++ this.originalGATK.excludeIntervals).exists(interval => !interval.exists()) } /** - * Returns the maximum number of intervals or this.scatterCount if the maximum can't be determined ahead of time. * @return the maximum number of intervals or this.scatterCount if the maximum can't be determined ahead of time. 
*/ protected def maxIntervals: Int } object GATKScatterFunction { - var gatkIntervals = Seq.empty[GATKIntervals] + var gatkIntervalsCache = Seq.empty[GATKIntervals] - def getGATKIntervals(reference: File, intervals: Seq[String]) = { - gatkIntervals.find(gi => gi.reference == reference && gi.intervals == intervals) match { - case Some(gi) => gi + def getGATKIntervals(originalFunction: CommandLineGATK) = { + val gatkIntervals = new GATKIntervals(originalFunction) + gatkIntervalsCache.find(_ == gatkIntervals) match { + case Some(existingGatkIntervals) => existingGatkIntervals case None => - val gi = new GATKIntervals(reference, intervals) - gatkIntervals :+= gi - gi + gatkIntervalsCache :+= gatkIntervals + gatkIntervals } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index 40a6fc4b4..03b142bca 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -33,12 +33,12 @@ import org.broadinstitute.sting.queue.function.InProcessFunction */ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction { protected override def maxIntervals = - GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).locs.size + GATKScatterFunction.getGATKIntervals(this.originalGATK).locs.size override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount def run() { - val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + val gi = GATKScatterFunction.getGATKIntervals(this.originalGATK) val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } diff 
--git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala index 8f52b9b82..150df4e38 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala @@ -31,13 +31,11 @@ import org.broadinstitute.sting.queue.function.InProcessFunction /** * A scatter function that divides down to the locus level. */ -//class LocusScatterFunction extends IntervalScatterFunction { } - class LocusScatterFunction extends GATKScatterFunction with InProcessFunction { protected override def maxIntervals = scatterCount def run() { - val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + val gi = GATKScatterFunction.getGATKIntervals(this.originalGATK) val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size) IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 11a66a37b..7862dec41 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -37,14 +37,11 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] - override def freezeFieldValues { + override def freezeFieldValues() { this.jarFile = this.originalGATK.jarFile - this.reference_sequence = this.originalGATK.reference_sequence - this.intervals = this.originalGATK.intervals - this.intervalsString = this.originalGATK.intervalsString - this.variant = 
this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) } this.out = this.originalOutput + GATKIntervals.copyIntervalArguments(this.originalGATK, this) // NO_HEADER and sites_only from VCFWriterArgumentTypeDescriptor // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK @@ -55,6 +52,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { val sitesOnly = QFunction.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.SITES_ONLY_ARG_NAME) this.sites_only = originalGATK.getFieldValue(sitesOnly).asInstanceOf[Boolean] - super.freezeFieldValues + super.freezeFieldValues() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala index 534d68069..13448afdd 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala @@ -49,6 +49,11 @@ trait JavaCommandLineFunction extends CommandLineFunction { */ var javaMemoryLimit: Option[Double] = None + /** + * Max number of GC threads + */ + var javaGCThreads: Option[Int] = None + override def freezeFieldValues() { super.freezeFieldValues() @@ -73,6 +78,8 @@ trait JavaCommandLineFunction extends CommandLineFunction { } def javaOpts = optional("-Xmx", javaMemoryLimit.map(gb => (gb * 1024).ceil.toInt), "m", spaceSeparated=false) + + conditional(javaGCThreads.isDefined, "-XX:+UseParallelOldGC") + + optional("-XX:ParallelGCThreads=", javaGCThreads, spaceSeparated=false) + required("-Djava.io.tmpdir=", jobTempDir, spaceSeparated=false) def commandLine = required("java") + diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index 73ab7c366..c69a310b3 100644 --- 
a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -56,8 +56,8 @@ trait QJobReport extends Logging { "jobName" -> QJobReport.workAroundSameJobNames(this), "intermediate" -> self.isIntermediate, "exechosts" -> info.getExecHosts, - "startTime" -> info.getStartTime.getTime, - "doneTime" -> info.getDoneTime.getTime, + "startTime" -> info.getStartTime, + "doneTime" -> info.getDoneTime, "formattedStartTime" -> info.getFormattedStartTime, "formattedDoneTime" -> info.getFormattedDoneTime, "runtime" -> info.getRuntimeInMs).mapValues((x:Any) => if (x != null) x.toString else "null") diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala index 6b615e6d9..0d8edc25d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala @@ -70,7 +70,7 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { * @return The parsed object. 
*/ def parse(parsingEngine: ParsingEngine, source: ArgumentSource, typeType: Type, argumentMatches: ArgumentMatches) = { - parse(parsingEngine,source, makeRawTypeIfNecessary(typeType), argumentMatches) + parse(parsingEngine,source, ArgumentTypeDescriptor.makeRawTypeIfNecessary(typeType), argumentMatches) } def parse(parsingEngine: ParsingEngine, source: ArgumentSource, classType: Class[_], argumentMatches: ArgumentMatches) = { diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index b23350557..2c6016c9b 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -26,19 +26,21 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File import org.testng.Assert -import org.testng.annotations.Test +import org.testng.annotations.{DataProvider, Test} import org.broadinstitute.sting.BaseTest import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser} import collection.JavaConversions._ import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.utils.exceptions.UserException class GATKIntervalsUnitTest { private final lazy val hg18Reference = new File(BaseTest.hg18Reference) private final lazy val hg18GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg18Reference)) private final lazy val hg18ReferenceLocs = GenomeLocSortedSet. 
createSetFromSequenceDictionary(new ReferenceDataSource(hg18Reference).getReference.getSequenceDictionary).toList + private final lazy val hg19GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg19Reference)) private final lazy val hg19Reference = new File(BaseTest.hg19Reference) @@ -48,14 +50,14 @@ class GATKIntervalsUnitTest { val chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-3") val chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:3-5") - val gi = new GATKIntervals(hg18Reference, Seq("chr1:1-1", "chr2:2-3", "chr3:3-5")) + val gi = createGATKIntervals(hg18Reference, Seq("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs.toSeq, Seq(chr1, chr2, chr3)) Assert.assertEquals(gi.contigs, Seq("chr1", "chr2", "chr3")) } @Test(timeOut = 30000L) def testIntervalFile() { - var gi = new GATKIntervals(hg19Reference, Seq(BaseTest.hg19Intervals)) + val gi = createGATKIntervals(hg19Reference, Seq(BaseTest.hg19Intervals)) Assert.assertEquals(gi.locs.size, 189894) // Timeout check is because of bad: // for(Item item: javaConvertedScalaList) @@ -67,28 +69,85 @@ class GATKIntervalsUnitTest { @Test def testEmptyIntervals() { - val gi = new GATKIntervals(hg18Reference, Nil) + val gi = createGATKIntervals(hg18Reference, Nil) Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) } @Test def testContigCounts() { - Assert.assertEquals(new GATKIntervals(hg18Reference, Nil).contigs, hg18ReferenceLocs.map(_.getContig)) - Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3")) - Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3")) + Assert.assertEquals(createGATKIntervals(hg18Reference, Nil).contigs, hg18ReferenceLocs.map(_.getContig)) + Assert.assertEquals(createGATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", 
"chr2", "chr3")) + Assert.assertEquals(createGATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3")) } - @Test - def testSortAndMergeIntervals() { - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-10", "chr1:1-10"), Seq("chr1:1-10")) - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-11", "chr1:1-12"), Seq("chr1:1-12")) - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"), Seq("chr1:1-10", "chr1:11-20", "chr1:21-30")) - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:10-20", "chr1:21-30"), Seq("chr1:1-20", "chr1:21-30")) - testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:21-30", "chr1:10-20"), Seq("chr1:1-20", "chr1:21-30")) + @DataProvider(name="sortAndMergeIntervals") + def getSortAndMergeIntervals: Array[Array[AnyRef]] = { + Array( + Array(Seq("chr1:1-10", "chr1:1-10", "chr1:1-10"), Seq("chr1:1-10")), + Array(Seq("chr1:1-10", "chr1:1-11", "chr1:1-12"), Seq("chr1:1-12")), + Array(Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"), Seq("chr1:1-30")), + Array(Seq("chr1:1-10", "chr1:10-20", "chr1:21-30"), Seq("chr1:1-30")), + Array(Seq("chr1:1-9", "chr1:21-30", "chr1:11-20"), Seq("chr1:1-9", "chr1:11-30")) + ).asInstanceOf[Array[Array[AnyRef]]] } - private def testSortAndMergeIntervals(actual: Seq[String], expected: Seq[String]) { - Assert.assertEquals(new GATKIntervals(hg18Reference, actual).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_))) + @Test(dataProvider="sortAndMergeIntervals") + def testSortAndMergeIntervals(unmerged: Seq[String], expected: Seq[String]) { + Assert.assertEquals(createGATKIntervals(hg18Reference, unmerged).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_))) + } + + @DataProvider(name="taggedFiles") + def getTaggedFiles: Array[Array[AnyRef]] = { + Array( + Array(hg18Reference, BaseTest.privateTestDir + "small_unmerged_gatk_intervals.list", null, Seq("chr1:1-10")), + Array(hg18Reference, BaseTest.privateTestDir + 
"small_unmerged_gatk_intervals.list", "", Seq("chr1:1-10")), + Array(hg18Reference, BaseTest.privateTestDir + "small_unmerged_gatk_intervals.list", "myList", Seq("chr1:1-10")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", null, Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "", Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "myVcf", Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "VCF", Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "myVcf,VCF", Seq("1:897475-897481", "1:10001292")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", null, Seq("20:1-999", "20:1002-2000", "22:1001-6000")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "", Seq("20:1-999", "20:1002-2000", "22:1001-6000")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "myBed", Seq("20:1-999", "20:1002-2000", "22:1001-6000")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "BED", Seq("20:1-999", "20:1002-2000", "22:1001-6000")), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "myBed,BED", Seq("20:1-999", "20:1002-2000", "22:1001-6000")) + ) + } + + @Test(dataProvider="taggedFiles") + def testTaggedFiles(reference: File, file: String, tags: String, expected: Seq[String]) { + val gatk = new CommandLineGATK + gatk.reference_sequence = reference + gatk.intervals = Seq(new TaggedFile(file, tags)) + val parser = if (reference == hg18Reference) hg18GenomeLocParser else hg19GenomeLocParser + Assert.assertEquals(new GATKIntervals(gatk).locs.toSeq, expected.map(parser.parseGenomeLoc(_))) + } + + @DataProvider(name="badTaggedFiles") + def getBadTaggedFiles: Array[Array[AnyRef]] = { + Array( + Array(hg18Reference, 
BaseTest.privateTestDir + "small_unmerged_gatk_intervals.list", "VCF"), + Array(hg18Reference, BaseTest.privateTestDir + "small_unmerged_gatk_intervals.list", "too,many,tags"), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "BED"), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "VCF,myVCF"), + Array(hg19Reference, BaseTest.privateTestDir + "small.indel.test.vcf", "myVCF,VCF,extra"), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "VCF"), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "BED,myBed"), + Array(hg19Reference, BaseTest.privateTestDir + "sampleBedFile.bed", "myBed,BED,extra") + ).asInstanceOf[Array[Array[AnyRef]]] + } + + @Test(dataProvider = "badTaggedFiles", expectedExceptions = Array(classOf[UserException])) + def testBadTaggedFiles(reference: File, file: String, tags: String) { + testTaggedFiles(reference, file, tags, Nil) + } + + private def createGATKIntervals(reference: File, intervals: Seq[String]) = { + val gatk = new CommandLineGATK + gatk.reference_sequence = reference + gatk.intervalsString = intervals + new GATKIntervals(gatk) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala index 6e37ae2a3..c9d8b59c9 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -42,4 +42,43 @@ class ExampleUnifiedGenotyperPipelineTest { spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } + + @Test + def testUnifiedGenotyperWithGatkIntervals() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_with_gatk_intervals" + spec.args = Array( + " -S 
public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", + " -R " + BaseTest.hg18Reference, + " -L " + BaseTest.validationDataLocation + "intervalTest.intervals").mkString + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test + def testUnifiedGenotyperWithBedIntervals() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_with_bed_intervals" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", + " -R " + BaseTest.hg18Reference, + " -L " + BaseTest.validationDataLocation + "intervalTest.bed").mkString + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test + def testUnifiedGenotyperWithVcfIntervals() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper_with_vcf_intervals" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", + " -R " + BaseTest.hg18Reference, + " -L " + BaseTest.validationDataLocation + "intervalTest.1.vcf").mkString + spec.jobRunners = Seq("Lsf706") + PipelineTest.executeTest(spec) + } } From 1fafd9f6c8b33271194fa3aaf6a5b05e73febb3b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 27 Jun 2012 16:55:49 -0400 Subject: [PATCH 21/32] NestedHashMap-based implementation of BQSRv2 along with a few minor optimizations. Not a huge runtime upgrade over the long bitset approach, but it allows us to implement further optimizations going forward. Integration test change because the original version had a bug in the quantized qual table creation. 
--- .../sting/gatk/walkers/bqsr/BQSRGatherer.java | 6 +- .../gatk/walkers/bqsr/BQSRKeyManager.java | 329 ------------------ .../gatk/walkers/bqsr/ContextCovariate.java | 140 ++------ .../sting/gatk/walkers/bqsr/Covariate.java | 14 +- .../gatk/walkers/bqsr/CycleCovariate.java | 39 +-- .../walkers/bqsr/QualityScoreCovariate.java | 16 +- .../gatk/walkers/bqsr/QuantizationInfo.java | 25 +- .../gatk/walkers/bqsr/ReadCovariates.java | 61 ++-- .../gatk/walkers/bqsr/ReadGroupCovariate.java | 21 +- .../gatk/walkers/bqsr/RecalDataManager.java | 312 ++++++++--------- .../sting/gatk/walkers/bqsr/RecalDatum.java | 3 +- .../walkers/bqsr/RecalibrationReport.java | 247 ++++++------- .../broadinstitute/sting/utils/BaseUtils.java | 65 +--- .../utils/collections/NestedHashMap.java | 51 +++ .../recalibration/BaseRecalibration.java | 100 +++--- .../recalibration/RecalibrationTables.java | 62 ++++ .../walkers/bqsr/BQSRKeyManagerUnitTest.java | 158 --------- .../bqsr/ContextCovariateUnitTest.java | 2 +- .../walkers/bqsr/CycleCovariateUnitTest.java | 2 +- .../bqsr/ReadGroupCovariateUnitTest.java | 4 +- .../bqsr/RecalibrationReportUnitTest.java | 48 ++- .../sting/utils/BitSetUtilsUnitTest.java | 17 - .../BaseRecalibrationUnitTest.java | 91 +---- 23 files changed, 566 insertions(+), 1247 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index d91ddd221..01fa92b8c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -71,11 +71,13 @@ 
public class BQSRGatherer extends Gatherer { if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) { File recal_out = new File(output.getName() + ".original"); RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); - RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + // TODO -- fix me + //RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); } else if (!RAC.NO_PLOTS) { File recal_out = new File(output.getName() + ".recal"); - RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + // TODO -- fix me + //RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); } generalReport.output(outputFile); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java deleted file mode 100644 index 29eecfbb1..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ /dev/null @@ -1,329 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.*; - -/** - * This class provides all the functionality for the BitSet representation of the keys to the hash table of BQSR - * - * It also handles the event type "covariate" which is not exactly a covariate, but is added as a key to the hashmap. The Key Manager will - * add the event type as a bitset to the end of the covariate bitset key. 
This way, it won't get int the way of masking the information - * out of the key for the actual covariates, and having the covariates handle it. The key manager handles the event type. - * - * The keys represented by this key manager will always have the same order: - * - * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate1, OptionalCovariateID, EventType - * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate2, OptionalCovariateID, EventType - * ... - * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariateN, OptionalCovariateID, EventType - * - * - * Note that Optional Covariates are optional, and the Key Manager should operate without them if necessary. - * - * @author Mauricio Carneiro - * @since 3/6/12 - */ -public class BQSRKeyManager { - - private final Covariate[] requiredCovariates; - private final Covariate[] optionalCovariates; - private final RequiredCovariateInfo[] requiredCovariatesInfo; - private final OptionalCovariateInfo[] optionalCovariatesInfo; - private final Map covariateNameToIDMap; - - private int nRequiredBits; // Number of bits used to represent the required covariates - - private final int optionalCovariateOffset; - private final int optionalCovariateIDOffset; - - private final long optionalCovariateMask; // Standard mask for optional covariates key - private final long optionalCovariateIDMask; // Standard mask for optional covariates order key - private final long eventIDMask; // Standard mask for event ID - - /** - * Initializes the KeyManager with the total number of covariates to use - * - * @param requiredCovariates the ordered list of required covariates - * @param optionalCovariates the ordered list of optional covariates - */ - public BQSRKeyManager(final List requiredCovariates, final List optionalCovariates) { - this.requiredCovariates = new Covariate[requiredCovariates.size()]; - this.optionalCovariates = new 
Covariate[optionalCovariates.size()]; - requiredCovariatesInfo = new RequiredCovariateInfo[requiredCovariates.size()]; // initialize the required covariates list - optionalCovariatesInfo = new OptionalCovariateInfo[optionalCovariates.size()]; // initialize the optional covariates list (size may be 0, it's okay) - covariateNameToIDMap = new HashMap(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates) - - nRequiredBits = 0; - for (int i = 0; i < requiredCovariates.size(); i++) { // create a list of required covariates with the extra information for key management - final Covariate required = requiredCovariates.get(i); - final int nBits = required.numberOfBits(); // number of bits used by this covariate - final long mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate - this.requiredCovariates[i] = required; - requiredCovariatesInfo[i] = new RequiredCovariateInfo(nBits, nRequiredBits, mask, required); // Create an object for this required covariate - nRequiredBits += nBits; - } - - final int bitsInEventType = numberOfBitsToRepresent(EventType.values().length); - eventIDMask = genericMask(nRequiredBits, bitsInEventType); - - short id = 0; - int nOptionalBits = 0; - for (int i = 0; i < optionalCovariates.size(); i++) { - final Covariate optional = optionalCovariates.get(i); - nOptionalBits = Math.max(nOptionalBits, optional.numberOfBits()); // optional covariates are represented by the number of bits needed by biggest covariate - this.optionalCovariates[i] = optional; - optionalCovariatesInfo[i] = new OptionalCovariateInfo(id, optional); - final String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport - covariateNameToIDMap.put(covariateName, id); - id++; - } - - optionalCovariateOffset = nRequiredBits + bitsInEventType; - 
optionalCovariateMask = genericMask(optionalCovariateOffset, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset - optionalCovariateIDOffset = nRequiredBits + bitsInEventType + nOptionalBits; - final int nOptionalIDBits = numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID - optionalCovariateIDMask = genericMask(optionalCovariateIDOffset, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset - - final int totalNumberOfBits = optionalCovariateIDOffset + nOptionalIDBits; // total number of bits used in the final key - if ( totalNumberOfBits > 64 ) - throw new UserException.BadInput("The total number of bits used for the master BQSR key is greater than 64 and cannot be represented in a long"); - } - - /** - * Generates one key given the optional covariate (or none if it is null) - * - * Keys include all required covariates, the standard covariate and the event type. - * - * @param allKeys The keys in long representation for each covariate (includes all optional covariates, not just the one requested) - * @param eventType The type of event described by this keyset (e.g. 
mismatches, insertions, deletions) - * @return one key in long representation (non-negative) or -1 for a bad key - */ - public long createMasterKey(final long[] allKeys, final EventType eventType, final int optionalCovariateIndex) { - - int keyIndex = 0; - long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on - for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) - masterKey |= (allKeys[keyIndex++] << infoRequired.offset); - - final long eventKey = keyFromEvent(eventType); // create a key for the event type - masterKey |= (eventKey << nRequiredBits); - - if (optionalCovariateIndex >= 0 && optionalCovariateIndex < optionalCovariates.length) { - final long covariateKey = allKeys[keyIndex + optionalCovariateIndex]; - if (covariateKey < 0) // do not add "nulls" to the final set of keys - return -1; - - masterKey |= (covariateKey << optionalCovariateOffset); - masterKey |= (optionalCovariatesInfo[optionalCovariateIndex].covariateID << optionalCovariateIDOffset); - } - - return masterKey; - } - - /** - * Generates one key for the covariates represented in Object[] key - * - * The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file) - * and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one key, not many. - * - * Example key: - * RG, QUAL, CYCLE, CYCLE_ID, EventType - * - * @param key list of objects produced by the required covariates followed by one or zero optional covariates. - * @return a key representing these objects. 
- */ - public long longFromKey(Object[] key) { - int requiredCovariate = 0; - long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on - for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) - masterKey |= (infoRequired.covariate.longFromKey(key[requiredCovariate++]) << infoRequired.offset); - - final int eventIndex = key.length - 1; // the event type is always the last key - final long eventKey = keyFromEvent((EventType) key[eventIndex]); // create a key for the event type - masterKey |= (eventKey << nRequiredBits); - - if (optionalCovariatesInfo.length > 0) { - final int covariateIndex = requiredCovariatesInfo.length; // the optional covariate index in the key array - final int covariateIDIndex = covariateIndex + 1; // the optional covariate ID index is right after the optional covariate's - final short covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index - final OptionalCovariateInfo infoOptional = optionalCovariatesInfo[covariateID]; // so we can get the optional covariate information - - final long covariateKey = infoOptional.covariate.longFromKey(key[covariateIndex]); // convert the optional covariate key into a bitset using the covariate's interface - masterKey |= (covariateKey << optionalCovariateOffset); - masterKey |= (infoOptional.covariateID << optionalCovariateIDOffset); - } - - return masterKey; - } - - /** - * Covariate id can be either the covariate name (String) or the actual id (short). This method - * finds it's type and converts accordingly to the short notation. - * - * @param id the string or short representation of the optional covariate id - * @return the short representation of the optional covariate id. - */ - private short parseCovariateID(final Object id) { - return (id instanceof String) ? 
covariateNameToIDMap.get(id.toString()) : (Short) id; - } - - /** - * Generates a key set of objects from a combined master key. - * - * Masks out each covariate independently and decodes their values (Object) into a keyset - * - * @param master the master representation of the keys - * @return an object array with the values for each key - */ - public List keySetFrom(final long master) { - final List objectKeys = new ArrayList(); - for (RequiredCovariateInfo info : requiredCovariatesInfo) { - final long covariateKey = extractKeyFromMaster(master, info.mask, info.offset); // get the covariate's key - objectKeys.add(info.covariate.formatKey(covariateKey)); // convert the key to object using covariate's interface - } - - if (optionalCovariatesInfo.length > 0) { - final long covKey = extractKeyFromMaster(master, optionalCovariateMask, optionalCovariateOffset); // get the covariate's key - final int covIDKey = (int)extractKeyFromMaster(master, optionalCovariateIDMask, optionalCovariateIDOffset); // get the covariate's id (to identify which covariate this is) - Covariate covariate = optionalCovariatesInfo[(short)covIDKey].covariate; // get the corresponding optional covariate object - objectKeys.add(covariate.formatKey(covKey)); // add the optional covariate key to the key set - objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id - } - - objectKeys.add(EventType.eventFrom((int)extractKeyFromMaster(master, eventIDMask, nRequiredBits))); // add the event type object to the key set - - return objectKeys; - } - - public Covariate[] getRequiredCovariates() { - return requiredCovariates; - } - - public Covariate[] getOptionalCovariates() { - return optionalCovariates; - } - - public int getNumRequiredCovariates() { - return requiredCovariates.length; - } - - public int getNumOptionalCovariates() { - return optionalCovariates.length; - } - - /** - * Creates a mask for the requested covariate to extract the relevant 
key from a combined master key - * - * @param offset the offset into the master key - * @param nBits the number of bits needed by the Covariate to represent its values - * @return the mask relevant to the covariate - */ - private long genericMask(final int offset, final int nBits) { - long mask = 0L; - for ( int i = 0; i < nBits; i++ ) - mask |= 1L << (offset+i); - return mask; - } - - private long extractKeyFromMaster(final long master, final long mask, final int offset) { - long key = master & mask; - return key >> offset; - } - - // cache the key representing an event since it's otherwise created a massive amount of times - private static final long[] eventTypeCache = new long[EventType.values().length]; // event IDs must be longs so that bit-fiddling works - static { - for (final EventType eventType : EventType.values()) - eventTypeCache[eventType.index] = (long)eventType.index; - } - - private long keyFromEvent(final EventType eventType) { - return eventTypeCache[eventType.index]; - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof BQSRKeyManager)) - return false; - - BQSRKeyManager other = (BQSRKeyManager) o; - if (this == other) - return true; - - if (requiredCovariatesInfo.length != other.requiredCovariatesInfo.length || - optionalCovariatesInfo.length != other.optionalCovariatesInfo.length) - return false; - - for (int i = 0; i < requiredCovariates.length; i++) { - Covariate myRequiredCovariate = requiredCovariates[i]; - Covariate otherRequiredCovariate = other.requiredCovariates[i]; - String thisName = myRequiredCovariate.getClass().getSimpleName(); - String otherName = otherRequiredCovariate.getClass().getSimpleName(); - if (!thisName.equals(otherName)) - return false; - } - - for (int i = 0; i < optionalCovariates.length; i++) { - Covariate myOptionalCovariate = optionalCovariates[i]; - Covariate otherOptionalCovariate = other.optionalCovariates[i]; - String thisName = myOptionalCovariate.getClass().getSimpleName(); - String 
otherName = otherOptionalCovariate.getClass().getSimpleName(); - if (!thisName.equals(otherName)) - return false; - } - - return true; - } - - /** - * Calculates the number of bits necessary to represent a given number of elements - * - * @param numberOfElements the number of elements to represent (must be positive) - * @return the number of bits necessary to represent this many elements - */ - public static int numberOfBitsToRepresent(long numberOfElements) { - if (numberOfElements < 0) - throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements); - - if (numberOfElements == 1L) - return 1; // special case - - int n = 0; - numberOfElements--; - while (numberOfElements > 0) { - numberOfElements = numberOfElements >> 1; - n++; - } - return n; - } - - /** - * Aggregate information for each Covariate - */ - private static class RequiredCovariateInfo { - public final int nBits; // number of bits for this key - public final int offset; // the offset into the master key - public final long mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits ) - public final Covariate covariate; // this allows reverse lookup of the Covariates in order - - RequiredCovariateInfo(final int nBits, final int offset, final long mask, final Covariate covariate) { - this.nBits = nBits; - this.offset = offset; - this.mask = mask; - this.covariate = covariate; - } - } - - private static class OptionalCovariateInfo { - public final long covariateID; // cache the covariate ID (must be a long so that bit-fiddling works) - public final Covariate covariate; - - OptionalCovariateInfo(final long covariateID, final Covariate covariate) { - this.covariateID = covariateID; - this.covariate = covariate; - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index 
fae2ac898..365c816c7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -43,6 +43,9 @@ public class ContextCovariate implements StandardCovariate { private int mismatchesContextSize; private int indelsContextSize; + // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are + // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. + static final private int MAX_DNA_CONTEXT = 13; private byte LOW_QUAL_TAIL; // Initialize any member variables using the command-line arguments passed to the walkers @@ -64,6 +67,7 @@ public class ContextCovariate implements StandardCovariate { @Override public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + // TODO -- wrong: fix me final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); @@ -73,7 +77,7 @@ public class ContextCovariate implements StandardCovariate { final int readLength = clippedRead.getReadLength(); for (int i = 0; i < readLength; i++) { - final long indelKey = contextWith(bases, i, indelsContextSize); + final int indelKey = contextWith(bases, i, indelsContextSize); values.addCovariate(contextWith(bases, i, mismatchesContextSize), indelKey, indelKey, (negativeStrand ? 
readLength - i - 1 : i)); } } @@ -85,7 +89,7 @@ public class ContextCovariate implements StandardCovariate { } @Override - public String formatKey(final long key) { + public String formatKey(final int key) { if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file return null; @@ -93,13 +97,8 @@ public class ContextCovariate implements StandardCovariate { } @Override - public long longFromKey(Object key) { - return keyFromContext((String) key); - } - - @Override - public int numberOfBits() { - return Integer.bitCount(Integer.MAX_VALUE); + public int keyFromValue(final Object value) { + return keyFromContext((String) value); } /** @@ -110,130 +109,61 @@ public class ContextCovariate implements StandardCovariate { * @param contextSize context size to use building the context * @return the key representing the context */ - private long contextWith(final byte[] bases, final int offset, final int contextSize) { + private int contextWith(final byte[] bases, final int offset, final int contextSize) { final int start = offset - contextSize + 1; - final long result; - if (start >= 0) - result = keyFromContext(bases, start, offset + 1); - else - result = -1L; - return result; + return (start >= 0) ? keyFromContext(bases, start, offset + 1) : -1; } - public static long keyFromContext(final String dna) { + public static int keyFromContext(final String dna) { return keyFromContext(dna.getBytes(), 0, dna.length()); } /** - * Creates a long representation of a given dna string. + * Creates a int representation of a given dna string. * - * Warning: This conversion is limited to long precision, therefore the dna sequence cannot - * be longer than 31 bases. - * - * The bit representation of a dna string is the simple: - * 0 A 4 AA 8 CA - * 1 C 5 AC ... 
- * 2 G 6 AG 1343 TTGGT - * 3 T 7 AT 1364 TTTTT - * - * To convert from dna to number, we convert the dna string to base10 and add all combinations that - * preceded the string (with smaller lengths). - * - * @param dna the dna sequence + * @param dna the dna sequence + * @param start the start position in the byte array (inclusive) + * @param end the end position in the array (exclusive) * @return the key representing the dna sequence */ - public static long keyFromContext(final byte[] dna, final int start, final int end) { - final long preContext = combinationsPerLength[end - start - 1]; // the sum of all combinations that preceded the length of the dna string - long baseTen = 0L; // the number in base_10 that we are going to use to generate the bit set + public static int keyFromContext(final byte[] dna, final int start, final int end) { + + // TODO -- bit fiddle to ge this all working in a single call to the method (mask out length, shift, OR length back in) + + int key = end - start; + int bitOffset = 4; for (int i = start; i < end; i++) { - baseTen = (baseTen << 2); // multiply by 4 final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]); if (baseIndex == -1) // ignore non-ACGT bases - return -1L; - baseTen += (long)baseIndex; + return -1; + key |= (baseIndex << bitOffset); + bitOffset += 2; } - return baseTen + preContext; // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. - } - - static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion. 
- static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length - static { - for (int i = 0; i < MAX_DNA_CONTEXT + 1; i++) - computeCombinationsFor(i); - } - - /** - * The sum of all combinations of a context of a given length from length = 0 to length. - * - * Memoized implementation of sum(4^i) , where i=[0,length] - * - * @param length the length of the DNA context - */ - private static void computeCombinationsFor(final int length) { - long combinations = 0L; - for (int i = 1; i <= length; i++) - combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) ) - combinationsPerLength[length] = combinations; + return key; } /** * Converts a key into the dna string representation. * - * Warning: This conversion is limited to long precision, therefore the dna sequence cannot - * be longer than 31 bases. - * - * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the - * base_10 representation of the sequence. This is important for us to know how to bring the number - * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented - * as 0's and leading 0's are omitted). - * - * quasi-canonical because A is represented by a 0, therefore, - * instead of : 0, 1, 2, 3, 10, 11, 12, ... - * we have : 0, 1, 2, 3, 00, 01, 02, ... - * - * but we can correctly decode it because we know the final length. - * * @param key the key representing the dna sequence * @return the dna sequence represented by the key */ - public static String contextFromKey(long key) { + public static String contextFromKey(final int key) { if (key < 0) throw new ReviewedStingException("dna conversion cannot handle negative numbers. 
Possible overflow?"); - final int length = contextLengthFor(key); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls) - key -= combinationsPerLength[length - 1]; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation + final int length = key & 15; // the first 4 bits represent the length (in bp) of the context + int mask = 48; // use the mask to pull out bases + int offset = 4; StringBuilder dna = new StringBuilder(); - while (key > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) - final byte base = (byte) (key & 3); // equivalent to (key % 4) - dna.append((char)BaseUtils.baseIndexToSimpleBase(base)); - key = key >> 2; // divide by 4 + for (int i = 0; i < length; i++) { + final int baseIndex = (key & mask) >> offset; + dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex)); + mask = mask << 2; // move the mask over to the next 2 bits + offset += 2; } - for (int j = dna.length(); j < length; j++) - dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above) - return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along - } - - /** - * Calculates the length of the DNA context for a given base 10 number - * - * It is important to know the length given the base 10 number to calculate the number of combinations - * and to disambiguate the "quasi-canonical" state. - * - * This method also calculates the number of combinations as a by-product, but since it memoizes the - * results, a subsequent call to combinationsFor(length) is O(1). 
- * - * @param number the base 10 representation of the key - * @return the length of the DNA context represented by this number - */ - private static int contextLengthFor(final long number) { - int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. - long combinations = combinationsPerLength[length]; // the next context (we advance it so we know which one was preceding it). - while (combinations <= number) { // find the length of the dna string (length) - length++; - combinations = combinationsPerLength[length]; // calculate the next context - } - return length; + return dna.toString(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java index ff86220b8..4b959eea4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java @@ -67,7 +67,7 @@ public interface Covariate { * @param key the long representation of the key * @return a string representation of the key */ - public String formatKey(final long key); + public String formatKey(final int key); /** * Converts an Object key into a long key using only the lowest numberOfBits() bits @@ -75,18 +75,10 @@ public interface Covariate { * Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in long format. For counting covariates * the getValues method already returns all values in long format. * - * @param key the object corresponding to the covariate + * @param value the object corresponding to the covariate * @return a long representation of the object */ - public long longFromKey(final Object key); - - /** - * Each covariate should determine how many bits are necessary to encode it's data - * - * @return The number of bits used to represent the values of this covariate. 
- */ - public int numberOfBits(); - + public int keyFromValue(final Object value); } interface RequiredCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java index 3e91ca539..3c917388c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java @@ -79,7 +79,7 @@ public class CycleCovariate implements StandardCovariate { final int CUSHION = 4; final int MAX_CYCLE = readLength - CUSHION - 1; for (int i = 0; i < readLength; i++) { - final long key = (iMAX_CYCLE) ? -1L : keyFromCycle(cycle); + final int key = (iMAX_CYCLE) ? -1 : keyFromCycle(cycle); values.addCovariate(key, key, key, i); cycle += increment; } @@ -106,22 +106,22 @@ public class CycleCovariate implements StandardCovariate { int iii = 0; while (iii < readLength) { while (iii < readLength && bases[iii] == (byte) 'T') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii++; } while (iii < readLength && bases[iii] == (byte) 'A') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii++; } while (iii < readLength && bases[iii] == (byte) 'C') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii++; } while (iii < readLength && bases[iii] == (byte) 'G') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii++; } @@ -132,7 +132,7 @@ public class CycleCovariate implements StandardCovariate { cycle++; } if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, 
key, iii); iii++; } @@ -143,22 +143,22 @@ public class CycleCovariate implements StandardCovariate { int iii = readLength - 1; while (iii >= 0) { while (iii >= 0 && bases[iii] == (byte) 'T') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } while (iii >= 0 && bases[iii] == (byte) 'A') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } while (iii >= 0 && bases[iii] == (byte) 'C') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } while (iii >= 0 && bases[iii] == (byte) 'G') { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } @@ -169,7 +169,7 @@ public class CycleCovariate implements StandardCovariate { cycle++; } if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { - final long key = keyFromCycle(cycle); + final int key = keyFromCycle(cycle); values.addCovariate(key, key, key, iii); iii--; } @@ -190,26 +190,21 @@ public class CycleCovariate implements StandardCovariate { } @Override - public String formatKey(final long key) { - long cycle = key >> 1; // shift so we can remove the "sign" bit + public String formatKey(final int key) { + int cycle = key >> 1; // shift so we can remove the "sign" bit if ( (key & 1) != 0 ) // is the last bit set? cycle *= -1; // then the cycle is negative return String.format("%d", cycle); } @Override - public long longFromKey(final Object key) { - return (key instanceof String) ? keyFromCycle(Integer.parseInt((String) key)) : keyFromCycle((Integer) key); + public int keyFromValue(final Object value) { + return (value instanceof String) ? 
keyFromCycle(Integer.parseInt((String) value)) : keyFromCycle((Integer) value); } - @Override - public int numberOfBits() { - return Integer.bitCount(Integer.MAX_VALUE); - } - - private static long keyFromCycle(final int cycle) { + private static int keyFromCycle(final int cycle) { // no negative values because values must fit into the first few bits of the long - long result = Math.abs(cycle); + int result = Math.abs(cycle); result = result << 1; // shift so we can add the "sign" bit if ( cycle < 0 ) result++; // negative cycles get the lower-most bit set diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java index c60ca38e1..8ee980124 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* @@ -49,7 +48,7 @@ public class QualityScoreCovariate implements RequiredCovariate { final byte[] baseDeletionQualities = read.getBaseDeletionQualities(); for (int i = 0; i < baseQualities.length; i++) { - values.addCovariate((long)baseQualities[i], (long)baseInsertionQualities[i], (long)baseDeletionQualities[i], i); + values.addCovariate((int)baseQualities[i], (int)baseInsertionQualities[i], (int)baseDeletionQualities[i], i); } } @@ -60,17 +59,12 @@ public class QualityScoreCovariate implements RequiredCovariate { } @Override - public String formatKey(final long key) { + public String formatKey(final int key) { return String.format("%d", key); } @Override - public long longFromKey(final Object key) { - return (key instanceof String) ? 
(long)Byte.parseByte((String) key) : (long)(Byte) key; + public int keyFromValue(final Object value) { + return (value instanceof String) ? (int)Byte.parseByte((String) value) : (int)(Byte) value; } - - @Override - public int numberOfBits() { - return BQSRKeyManager.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE); - } -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java index 02339330b..541f3a0a5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java @@ -1,13 +1,14 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.recalibration.QualQuantizer; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import java.util.Arrays; import java.util.List; -import java.util.Map; /** * Class that encapsulates the information necessary for quality score quantization for BQSR @@ -30,25 +31,17 @@ public class QuantizationInfo { this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); } - public QuantizationInfo(Map> keysAndTablesMap, int quantizationLevels) { + public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) { final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution for (int i = 0; i < qualHistogram.length; i++) qualHistogram[i] = 0L; - Map qualTable = null; // look for the quality 
score table - for (Map.Entry> entry : keysAndTablesMap.entrySet()) { - BQSRKeyManager keyManager = entry.getKey(); - if (keyManager.getNumRequiredCovariates() == 2) // it should be the only one with 2 required covariates - qualTable = entry.getValue(); - } + final NestedHashMap qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); // get the quality score table - if (qualTable == null) - throw new ReviewedStingException("Could not find QualityScore table."); - - for (RecalDatum datum : qualTable.values()) { - int empiricalQual = (int) Math.round(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) - long nObservations = datum.numObservations; - qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key + for (final Object value : qualTable.getAllValues()) { + final RecalDatum datum = (RecalDatum)value; + final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) + qualHistogram[empiricalQual] += datum.numObservations; // add the number of observations for every key } empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities quantizeQualityScores(quantizationLevels); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java index c9043dc04..5e907237d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - /** * The object temporarily held by a read that describes all of it's covariates. 
* @@ -11,65 +9,56 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; * @since 2/8/12 */ public class ReadCovariates { - private final long[][] mismatchesKeySet; - private final long[][] insertionsKeySet; - private final long[][] deletionsKeySet; + private final int[][][] keys; private int currentCovariateIndex = 0; - public ReadCovariates(int readLength, int numberOfCovariates) { - this.mismatchesKeySet = new long[readLength][numberOfCovariates]; - this.insertionsKeySet = new long[readLength][numberOfCovariates]; - this.deletionsKeySet = new long[readLength][numberOfCovariates]; + public ReadCovariates(final int readLength, final int numberOfCovariates) { + keys = new int[EventType.values().length][readLength][numberOfCovariates]; } public void setCovariateIndex(final int index) { currentCovariateIndex = index; } - public void addCovariate(final long mismatch, final long insertion, final long deletion, final int readOffset) { - mismatchesKeySet[readOffset][currentCovariateIndex] = mismatch; - insertionsKeySet[readOffset][currentCovariateIndex] = insertion; - deletionsKeySet[readOffset][currentCovariateIndex] = deletion; + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { + keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; + keys[EventType.BASE_DELETION.index][readOffset][currentCovariateIndex] = deletion; } - public long[] getKeySet(final int readPosition, final EventType errorModel) { - switch (errorModel) { - case BASE_SUBSTITUTION: - return getMismatchesKeySet(readPosition); - case BASE_INSERTION: - return getInsertionsKeySet(readPosition); - case BASE_DELETION: - return getDeletionsKeySet(readPosition); - default: - throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel); - } + public int[] getKeySet(final int readPosition, 
final EventType errorModel) { + return keys[errorModel.index][readPosition]; } - public long[] getMismatchesKeySet(final int readPosition) { - return mismatchesKeySet[readPosition]; + public int[][] getKeySet(final EventType errorModel) { + return keys[errorModel.index]; } - public long[] getInsertionsKeySet(final int readPosition) { - return insertionsKeySet[readPosition]; + public int[] getMismatchesKeySet(final int readPosition) { + return keys[EventType.BASE_SUBSTITUTION.index][readPosition]; } - public long[] getDeletionsKeySet(final int readPosition) { - return deletionsKeySet[readPosition]; + public int[] getInsertionsKeySet(final int readPosition) { + return keys[EventType.BASE_INSERTION.index][readPosition]; + } + + public int[] getDeletionsKeySet(final int readPosition) { + return keys[EventType.BASE_DELETION.index][readPosition]; } /** * Testing routines */ - protected long[][] getMismatchesKeySet() { - return mismatchesKeySet; + protected int[][] getMismatchesKeySet() { + return keys[EventType.BASE_SUBSTITUTION.index]; } - protected long[][] getInsertionsKeySet() { - return insertionsKeySet; + protected int[][] getInsertionsKeySet() { + return keys[EventType.BASE_INSERTION.index]; } - protected long[][] getDeletionsKeySet() { - return deletionsKeySet; + protected int[][] getDeletionsKeySet() { + return keys[EventType.BASE_DELETION.index]; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java index ae0ef38cc..c086ef6d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java @@ -40,9 +40,9 @@ import java.util.HashMap; public class ReadGroupCovariate implements RequiredCovariate { - private final HashMap readGroupLookupTable = new HashMap(); - private final HashMap readGroupReverseLookupTable = new 
HashMap(); - private long nextId = 0L; + private final HashMap readGroupLookupTable = new HashMap(); + private final HashMap readGroupReverseLookupTable = new HashMap(); + private int nextId = 0; // Initialize any member variables using the command-line arguments passed to the walkers @Override @@ -51,7 +51,7 @@ public class ReadGroupCovariate implements RequiredCovariate { @Override public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { final String readGroupId = readGroupValueFromRG(read.getReadGroup()); - final long key = keyForReadGroup(readGroupId); + final int key = keyForReadGroup(readGroupId); final int l = read.getReadLength(); for (int i = 0; i < l; i++) @@ -64,21 +64,16 @@ public class ReadGroupCovariate implements RequiredCovariate { } @Override - public String formatKey(final long key) { + public String formatKey(final int key) { return readGroupReverseLookupTable.get(key); } @Override - public long longFromKey(Object key) { - return keyForReadGroup((String) key); + public int keyFromValue(final Object value) { + return keyForReadGroup((String) value); } - @Override - public int numberOfBits() { - return BQSRKeyManager.numberOfBitsToRepresent(Short.MAX_VALUE); - } - - private long keyForReadGroup(final String readGroupId) { + private int keyForReadGroup(final String readGroupId) { if (!readGroupLookupTable.containsKey(readGroupId)) { readGroupLookupTable.put(readGroupId, nextId); readGroupReverseLookupTable.put(nextId, readGroupId); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index 1356ffa94..ec82da95f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -32,11 +32,13 @@ import org.broadinstitute.sting.utils.BaseUtils; import 
org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.io.Resource; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -82,6 +84,14 @@ public class RecalDataManager { private static final String SCRIPT_FILE = "BQSR.R"; + private static final Pair covariateValue = new Pair(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s"); + private static final Pair covariateName = new Pair(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s"); + private static final Pair eventType = new Pair(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s"); + private static final Pair empiricalQuality = new Pair(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); + private static final Pair estimatedQReported = new Pair(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); + private static final Pair nObservations = new Pair(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); + private static final Pair nErrors = new Pair(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d"); + public enum SOLID_RECAL_MODE { /** @@ -141,30 +151,6 @@ public class RecalDataManager { } } - - /** - * Initializes the recalibration table -> key manager map - * - * @param requiredCovariates list of required covariates (in order) - * @param optionalCovariates list of optional covariates (in order) - * @return a map with each key manager and 
it's corresponding recalibration table properly initialized - */ - public static LinkedHashMap> initializeTables(ArrayList requiredCovariates, ArrayList optionalCovariates) { - final LinkedHashMap> tablesAndKeysMap = new LinkedHashMap>(); - final ArrayList requiredCovariatesToAdd = new ArrayList(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. - final ArrayList optionalCovariatesToAdd = new ArrayList(); // initialize an empty array of optional covariates to create the first few tables - for (Covariate covariate : requiredCovariates) { - requiredCovariatesToAdd.add(covariate); - final Map recalTable = new HashMap(); // initializing a new recal table for each required covariate (cumulatively) - final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager - tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map - } - final Map recalTable = new HashMap(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates - final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager - tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map - return tablesAndKeysMap; - } - /** * Generates two lists : required covariates and optional covariates based on the user's requests. 
* @@ -223,42 +209,29 @@ public class RecalDataManager { logger.info(""); } - private static List generateReportTables(Map> keysAndTablesMap) { + private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates) { List result = new LinkedList(); int tableIndex = 0; - final Pair covariateValue = new Pair(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s"); - final Pair covariateName = new Pair(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s"); - final Pair eventType = new Pair(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s"); - final Pair empiricalQuality = new Pair(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); - final Pair estimatedQReported = new Pair(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - final Pair nObservations = new Pair(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); - final Pair nErrors = new Pair(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d"); + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); - for (Map.Entry> entry : keysAndTablesMap.entrySet()) { - final BQSRKeyManager keyManager = entry.getKey(); - final Map recalTable = entry.getValue(); + for (final RecalibrationTables.TableType type : RecalibrationTables.TableType.values()) { - final boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs. 
- - final Covariate[] requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table - final Covariate[] optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table - - final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names - - for (final Covariate covariate : requiredList) { - final String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order - columnNames.add(new Pair(name, "%s")); // save the required covariate name so we can reference it in the future - } - - if (optionalList.length > 0) { - columnNames.add(covariateValue); - columnNames.add(covariateName); + final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future + if (type != RecalibrationTables.TableType.READ_GROUP_TABLE) { + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future + if (type == RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLE) { + columnNames.add(covariateValue); + columnNames.add(covariateName); + } } columnNames.add(eventType); // the order of these column names is important here columnNames.add(empiricalQuality); - if (isReadGroupTable) + if (type == RecalibrationTables.TableType.READ_GROUP_TABLE) columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported columnNames.add(nObservations); columnNames.add(nErrors); @@ -269,42 +242,59 @@ public class RecalDataManager { int rowIndex = 0; - for (Map.Entry recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys - final Long 
bitSetKey = recalTableEntry.getKey(); - final Map columnData = new HashMap(columnNames.size()); - final Iterator> iterator = columnNames.iterator(); - for (final Object key : keyManager.keySetFrom(bitSetKey)) { - final String columnName = iterator.next().getFirst(); - columnData.put(columnName, key); - } - final RecalDatum datum = recalTableEntry.getValue(); - columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); - if (isReadGroupTable) - columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table - columnData.put(iterator.next().getFirst(), datum.numObservations); - columnData.put(iterator.next().getFirst(), datum.numMismatches); + final NestedHashMap table = recalibrationTables.getTable(type); + for (final NestedHashMap.Leaf row : table.getAllLeaves()) { + final RecalDatum datum = (RecalDatum)row.value; + final List keys = row.keys; - for (final Map.Entry dataEntry : columnData.entrySet()) { - final String columnName = dataEntry.getKey(); - final Object value = dataEntry.getValue(); - reportTable.set(rowIndex, columnName, value.toString()); + int columnIndex = 0; + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex).getFirst(), requestedCovariates[0].formatKey((Integer)keys.get(columnIndex++))); + if (type != RecalibrationTables.TableType.READ_GROUP_TABLE) { + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex).getFirst(), requestedCovariates[1].formatKey((Integer) keys.get(columnIndex++))); + if (type == RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLE) { + final int covariateIndex = (Integer)keys.get(columnIndex); + final Covariate covariate = requestedCovariates[2 + covariateIndex]; + final int covariateKey = (Integer)keys.get(columnIndex+1); + + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(covariateKey)); + setReportTableCell(reportTable, rowIndex, 
columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate)); + } } + + final EventType event = EventType.eventFrom((Integer)keys.get(columnIndex)); + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), event); + + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); + if (type == RecalibrationTables.TableType.READ_GROUP_TABLE) + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex++).getFirst(), datum.numObservations); + setReportTableCell(reportTable, rowIndex, columnNames.get(columnIndex).getFirst(), datum.numMismatches); + rowIndex++; } result.add(reportTable); } + return result; } - public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map> keysAndTablesMap, PrintStream outputFile) { - outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile); + private static String parseCovariateName(final Covariate covariate) { + return covariate.getClass().getSimpleName().split("Covariate")[0]; } - public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap> keysAndTablesMap, PrintStream outputFile) { - outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile); + private static void setReportTableCell(final GATKReportTable reportTable, final int rowIndex, final String columnName, final Object value) { + reportTable.set(rowIndex, columnName, value.toString()); } - private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List recalTables, 
PrintStream outputFile) { + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) { + outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); + } + + public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) { + outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); + } + + private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables, final PrintStream outputFile) { final GATKReport report = new GATKReport(); report.addTable(argumentTable); report.addTable(quantizationTable); @@ -340,108 +330,87 @@ public class RecalDataManager { } - public static void generateRecalibrationPlot(File filename, LinkedHashMap> original, boolean keepIntermediates) { + public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final Covariate[] requestedCovariates, final boolean keepIntermediates) { final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), original, "ORIGINAL", true); + writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, true); outputRecalibrationPlot(files, keepIntermediates); } - public static void generateRecalibrationPlot(File filename, LinkedHashMap> original, LinkedHashMap> recalibrated, boolean keepIntermediates) { + public static void generateRecalibrationPlot(final File filename, final RecalibrationTables 
original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates, final boolean keepIntermediates) { final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true); - writeCSV(files.getFirst(), original, "ORIGINAL", false); + writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, false); outputRecalibrationPlot(files, keepIntermediates); } - private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap> map, String recalibrationMode, boolean printHeader) { - final int QUALITY_SCORE_COVARIATE_INDEX = 1; - final Map deltaTable = new HashMap(); - BQSRKeyManager deltaKeyManager = null; + private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { + final NestedHashMap deltaTable = new NestedHashMap(); - - for (Map.Entry> tableEntry : map.entrySet()) { - final BQSRKeyManager keyManager = tableEntry.getKey(); - - if (keyManager.getNumOptionalCovariates() > 0) { // initialize with the 'all covariates' table - // create a key manager for the delta table - final List requiredCovariates = Arrays.asList(keyManager.getRequiredCovariates()[0]); // include the read group covariate as the only required covariate - final List optionalCovariates = new ArrayList(); - optionalCovariates.add(keyManager.getRequiredCovariates()[1]); // include the quality score covariate as an optional covariate - optionalCovariates.addAll(Arrays.asList(keyManager.getOptionalCovariates())); // include all optional covariates - deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager - } + // add the quality score table to the delta table + final NestedHashMap qualTable = 
recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); + for (final NestedHashMap.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table + final List newCovs = new ArrayList(4); + newCovs.add(leaf.keys.get(0)); + newCovs.add(requestedCovariates.length); // replace the covariate name with an arbitrary (unused) index for QualityScore + newCovs.add(leaf.keys.get(1)); + newCovs.add(leaf.keys.get(2)); + addToDeltaTable(deltaTable, newCovs.toArray(), (RecalDatum)leaf.value); // add this covariate to the delta table } - if (deltaKeyManager == null) - throw new ReviewedStingException ("Couldn't find the covariates table"); - - boolean readyToPrint = false; - for (Map.Entry> tableEntry : map.entrySet()) { - final BQSRKeyManager keyManager = tableEntry.getKey(); - - if (keyManager.getNumRequiredCovariates() == 2 && keyManager.getNumOptionalCovariates() == 0) { // look for the QualityScore table - final Map table = tableEntry.getValue(); - - // add the quality score table to the delta table - for (final Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table - final RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) - - final List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key - final List newCovs = new ArrayList(4); - newCovs.add(0, covs.get(0)); // replace the covariate value with the quality score - newCovs.add(1, covs.get(1)); - newCovs.add(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate) - newCovs.add(3, covs.get(2)); - final long deltaKey = deltaKeyManager.longFromKey(newCovs.toArray()); // create a new bitset key for the delta table - addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table - } - } - - else if (keyManager.getNumOptionalCovariates() > 0) { // look 
for the optional covariates table - final Map table = tableEntry.getValue(); - - // add the optional covariates to the delta table - for (final Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table - final RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) - - final List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key - covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) - final long deltaKey = deltaKeyManager.longFromKey(covs.toArray()); // create a new bitset key for the delta table - addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table - } - readyToPrint = true; - } - - // output the csv file - if (readyToPrint) { - - if (printHeader) { - final List header = new LinkedList(); - header.add("ReadGroup"); - header.add("CovariateValue"); - header.add("CovariateName"); - header.add("EventType"); - header.add("Observations"); - header.add("Errors"); - header.add("EmpiricalQuality"); - header.add("AverageReportedQuality"); - header.add("Accuracy"); - header.add("Recalibration"); - deltaTableFile.println(Utils.join(",", header)); - } - - // print each data line - for (final Map.Entry deltaEntry : deltaTable.entrySet()) { - final List deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey()); - final RecalDatum deltaDatum = deltaEntry.getValue(); - deltaTableFile.print(Utils.join(",", deltaKeys)); - deltaTableFile.print("," + deltaDatum.stringForCSV()); - deltaTableFile.println("," + recalibrationMode); - } - - } - + // add the optional covariates to the delta table + final NestedHashMap covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLE); + for (final NestedHashMap.Leaf leaf : covTable.getAllLeaves()) { + final List covs = new ArrayList(leaf.keys); + 
covs.remove(1); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + addToDeltaTable(deltaTable, covs.toArray(), (RecalDatum)leaf.value); // add this covariate to the delta table } + + // output the csv file + if (printHeader) { + final List header = new LinkedList(); + header.add("ReadGroup"); + header.add("CovariateValue"); + header.add("CovariateName"); + header.add("EventType"); + header.add("Observations"); + header.add("Errors"); + header.add("EmpiricalQuality"); + header.add("AverageReportedQuality"); + header.add("Accuracy"); + header.add("Recalibration"); + deltaTableFile.println(Utils.join(",", header)); + } + + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); + + // print each data line + for (final NestedHashMap.Leaf leaf : deltaTable.getAllLeaves()) { + final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); + final RecalDatum deltaDatum = (RecalDatum)leaf.value; + deltaTableFile.print(Utils.join(",", deltaKeys)); + deltaTableFile.print("," + deltaDatum.stringForCSV()); + deltaTableFile.println("," + recalibrationMode); + } + } + + private static List generateValuesFromKeys(final List keys, final Covariate[] covariates, final Map covariateNameMap) { + final List values = new ArrayList(4); + values.add(covariates[0].formatKey((Integer)keys.get(0))); + + // TODO -- create static final variables to hold the indexes of the RG, qual, cov ID, etc. + + final int covariateIndex = (Integer)keys.get(1); + final Covariate covariate = covariateIndex == covariates.length ? 
covariates[1] : covariates[2 + covariateIndex]; + final int covariateKey = (Integer)keys.get(2); + values.add(covariate.formatKey(covariateKey)); + values.add(covariateNameMap.get(covariate)); + + final EventType event = EventType.eventFrom((Integer)keys.get(3)); + values.add(event); + + return values; } /** @@ -453,15 +422,14 @@ public class RecalDataManager { * @param deltaKey the key to the table * @param recalDatum the recal datum to combine with the accuracyDatum element in the table */ - private static void addToDeltaTable(Map deltaTable, Long deltaKey, RecalDatum recalDatum) { - final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key + private static void addToDeltaTable(final NestedHashMap deltaTable, final Object[] deltaKey, final RecalDatum recalDatum) { + final RecalDatum deltaDatum = (RecalDatum)deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key if (deltaDatum == null) - deltaTable.put(deltaKey, new RecalDatum(recalDatum)); // if we don't have a key yet, create a new one with the same values as the curent datum + deltaTable.put(new RecalDatum(recalDatum), deltaKey); // if we don't have a key yet, create a new one with the same values as the curent datum else deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. } - /** * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string * @@ -627,13 +595,13 @@ public class RecalDataManager { * * @param read The read for which to compute covariate values. * @param requestedCovariates The list of requested covariates. 
- * @param readCovariates The object to store the covariate values + * @param resultsStorage The object to store the covariate values */ - public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates readCovariates) { + public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates resultsStorage) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read for (int i = 0; i < requestedCovariates.length; i++) { - readCovariates.setCovariateIndex(i); - requestedCovariates[i].recordValues(read, readCovariates); + resultsStorage.setCovariateIndex(i); + requestedCovariates[i].recordValues(read, resultsStorage); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index 3eb3a3981..b26912c31 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -113,8 +113,7 @@ public class RecalDatum extends Datum { return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported()); } - - private double calcExpectedErrors() { + private double calcExpectedErrors() { return (double) this.numObservations * qualToErrorProb(estimatedQReported); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java index 5af15c01c..a7088f4b6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java @@ -3,8 +3,9 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import 
org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import java.io.File; import java.io.PrintStream; @@ -18,14 +19,19 @@ import java.util.*; */ public class RecalibrationReport { private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) - private final LinkedHashMap> keysAndTablesMap; // quick access reference to the read group table and its key manager + private final RecalibrationTables recalibrationTables; // quick access reference to the tables private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + private final HashMap optionalCovariateIndexes; private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter + private final Object[] tempRGarray = new Object[2]; + private final Object[] tempQUALarray = new Object[3]; + private final Object[] tempCOVarray = new Object[5]; + public RecalibrationReport(final File RECAL_FILE) { - GATKReport report = new GATKReport(RECAL_FILE); + final GATKReport report = new GATKReport(RECAL_FILE); argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE); RAC = initializeArgumentCollectionTable(argumentTable); @@ -37,52 +43,39 @@ public class RecalibrationReport { ArrayList requiredCovariates = covariates.getFirst(); ArrayList optionalCovariates = covariates.getSecond(); requestedCovariates = new Covariate[requiredCovariates.size() + 
optionalCovariates.size()]; + optionalCovariateIndexes = new HashMap(optionalCovariates.size()); int covariateIndex = 0; for (final Covariate covariate : requiredCovariates) requestedCovariates[covariateIndex++] = covariate; - for (final Covariate covariate : optionalCovariates) - requestedCovariates[covariateIndex++] = covariate; + for (final Covariate covariate : optionalCovariates) { + requestedCovariates[covariateIndex] = covariate; + final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport + optionalCovariateIndexes.put(covariateName, covariateIndex-2); + covariateIndex++; + } for (Covariate cov : requestedCovariates) cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection - keysAndTablesMap = new LinkedHashMap>(); - ArrayList requiredCovariatesToAdd = new ArrayList(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. - ArrayList optionalCovariatesToAdd = new ArrayList(); // initialize an empty array of optional covariates to create the first few tables - for (Covariate covariate : requiredCovariates) { - requiredCovariatesToAdd.add(covariate); - final Map table; // initializing a new recal table for each required covariate (cumulatively) - final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager + final GATKReportTable rgReportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE); + final NestedHashMap rgTable = parseReadGroupTable(rgReportTable); - final int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES) - final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. 
Did you add an extra required covariate? This is a hard check."; - if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table - final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE); - table = parseReadGroupTable(keyManager, reportTable); - } - else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table - final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE); - table = parseQualityScoreTable(keyManager, reportTable); - } - else - throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION); + final GATKReportTable qualReportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE); + final NestedHashMap qualTable = parseQualityScoreTable(qualReportTable); - keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map - } + final GATKReportTable covReportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE); + final NestedHashMap covTable = parseAllCovariatesTable(covReportTable); - - final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager - final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE); - final Map table = parseAllCovariatesTable(keyManager, reportTable); - keysAndTablesMap.put(keyManager, table); + recalibrationTables = new RecalibrationTables(rgTable, qualTable, covTable); } - protected RecalibrationReport(final QuantizationInfo quantizationInfo, final LinkedHashMap> keysAndTablesMap, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) { + protected RecalibrationReport(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final GATKReportTable 
argumentTable, final RecalibrationArgumentCollection RAC) { this.quantizationInfo = quantizationInfo; - this.keysAndTablesMap = keysAndTablesMap; + this.recalibrationTables = recalibrationTables; this.argumentTable = argumentTable; this.RAC = RAC; this.requestedCovariates = null; + this.optionalCovariateIndexes = null; } /** @@ -98,29 +91,20 @@ public class RecalibrationReport { * * @param other the recalibration report to combine with this one */ - public void combine(RecalibrationReport other) { - Iterator>> thisIterator = keysAndTablesMap.entrySet().iterator(); + public void combine(final RecalibrationReport other) { - for (Map.Entry> otherEntry : other.getKeysAndTablesMap().entrySet()) { - Map.Entry> thisEntry = thisIterator.next(); + for (RecalibrationTables.TableType type : RecalibrationTables.TableType.values()) { + final NestedHashMap myTable = recalibrationTables.getTable(type); + final NestedHashMap otherTable = other.recalibrationTables.getTable(type); - final Map thisTable = thisEntry.getValue(); - final BQSRKeyManager thisKeyManager = thisEntry.getKey(); - final BQSRKeyManager otherKeyManager = otherEntry.getKey(); + for (final NestedHashMap.Leaf row : otherTable.getAllLeaves()) { + final RecalDatum myDatum = (RecalDatum)myTable.get(row.keys); - for (Map.Entry otherTableEntry : otherEntry.getValue().entrySet()) { - final RecalDatum otherDatum = otherTableEntry.getValue(); - final Long otherBitKey = otherTableEntry.getKey(); - final List otherObjectKey = otherKeyManager.keySetFrom(otherBitKey); - - final long thisKey = thisKeyManager.longFromKey(otherObjectKey.toArray()); - final RecalDatum thisDatum = thisTable.get(thisKey); - - if (thisDatum == null) - thisTable.put(thisKey, otherDatum); + if (myDatum == null) + myTable.put(row.value, row.keys); else - thisDatum.combine(otherDatum); - } + myDatum.combine((RecalDatum)row.value); + } } } @@ -128,8 +112,8 @@ public class RecalibrationReport { return quantizationInfo; } - public LinkedHashMap> 
getKeysAndTablesMap() { - return keysAndTablesMap; + public RecalibrationTables getRecalibrationTables() { + return recalibrationTables; } public Covariate[] getRequestedCovariates() { @@ -139,82 +123,87 @@ public class RecalibrationReport { /** * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table * - * @param keyManager the key manager for this table * @param reportTable the GATKReport table containing data for this table * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. */ - private Map parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { - ArrayList columnNamesOrderedList = new ArrayList(5); - columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); - return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false); + private NestedHashMap parseAllCovariatesTable(final GATKReportTable reportTable) { + final NestedHashMap result = new NestedHashMap(); + + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + tempCOVarray[0] = requestedCovariates[0].keyFromValue(rg); + final Object qual = reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + tempCOVarray[1] = requestedCovariates[1].keyFromValue(qual); + final String covName = (String)reportTable.get(i, RecalDataManager.COVARIATE_NAME_COLUMN_NAME); + final int covIndex = optionalCovariateIndexes.get(covName); + tempCOVarray[2] = covIndex; + final Object covValue = reportTable.get(i, RecalDataManager.COVARIATE_VALUE_COLUMN_NAME); + 
tempCOVarray[3] = requestedCovariates[covIndex + 2].keyFromValue(covValue); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + tempCOVarray[4] = event.index; + + result.put(getRecalDatum(reportTable, i, false), tempCOVarray); + } + + return result; } /** * * Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table - * @param keyManager the key manager for this table * @param reportTable the GATKReport table containing data for this table * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. */ - private Map parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { - ArrayList columnNamesOrderedList = new ArrayList(3); - columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); - return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false); + private NestedHashMap parseQualityScoreTable(final GATKReportTable reportTable) { + final NestedHashMap result = new NestedHashMap(); + + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + tempQUALarray[0] = requestedCovariates[0].keyFromValue(rg); + final Object qual = reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + tempQUALarray[2] = event.index; + + result.put(getRecalDatum(reportTable, i, false), tempQUALarray); + } + + return result; } /** * Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual 
table * - * @param keyManager the key manager for this table * @param reportTable the GATKReport table containing data for this table * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. */ - private Map parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { - ArrayList columnNamesOrderedList = new ArrayList(2); - columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); - columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); - return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, true); - } - - /** - * Shared parsing functionality for all tables. - * - * @param keyManager the key manager for this table - * @param reportTable the GATKReport table containing data for this table - * @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table - * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. - */ - private Map genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList columnNamesOrderedList, boolean hasEstimatedQReportedColumn) { - final Map result = new HashMap(reportTable.getNumRows()*2); + private NestedHashMap parseReadGroupTable(final GATKReportTable reportTable) { + final NestedHashMap result = new NestedHashMap(); for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final int nKeys = columnNamesOrderedList.size(); - final Object [] keySet = new Object[nKeys]; - for (int j = 0; j < nKeys; j++) - keySet[j] = reportTable.get(i, columnNamesOrderedList.get(j)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below) - keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. 
We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager). - final long bitKey = keyManager.longFromKey(keySet); + final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + tempRGarray[1] = event.index; - final long nObservations = (Long) reportTable.get(i, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); - final long nErrors = (Long) reportTable.get(i, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); - final double empiricalQuality = (Double) reportTable.get(i, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME); - - final double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table - (Double) reportTable.get(i, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table - Byte.parseByte((String) reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table - - final RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality); - result.put(bitKey, recalDatum); + result.put(getRecalDatum(reportTable, i, true), tempRGarray); } + return result; } + private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { + final long nObservations = (Long) reportTable.get(row, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); + final long nErrors = (Long) reportTable.get(row, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); + final double empiricalQuality = (Double) reportTable.get(row, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME); + + final double estimatedQReported = hasEstimatedQReportedColumn ? 
// the estimatedQreported column only exists in the ReadGroup table + (Double) reportTable.get(row, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table + Byte.parseByte((String) reportTable.get(row, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table + + return new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality); + } + /** * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores * @@ -308,55 +297,21 @@ public class RecalibrationReport { * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer. */ public void calculateEmpiricalAndQuantizedQualities() { - for (Map table : keysAndTablesMap.values()) - for (RecalDatum datum : table.values()) - datum.calcCombinedEmpiricalQuality(); + for (RecalibrationTables.TableType type : RecalibrationTables.TableType.values()) { + final NestedHashMap table = recalibrationTables.getTable(type); + for (final Object value : table.getAllValues()) { + ((RecalDatum)value).calcCombinedEmpiricalQuality(); + } + } - quantizationInfo = new QuantizationInfo(keysAndTablesMap, RAC.QUANTIZING_LEVELS); + quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); } public void output(PrintStream output) { - RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output); + RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output); } public RecalibrationArgumentCollection getRAC() { return RAC; } - - @Override - public boolean equals(Object o) { - if (!(o instanceof RecalibrationReport)) - return false; - RecalibrationReport other = (RecalibrationReport) o; - if (this == o) - return true; - return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap); - } - - private boolean 
isEqualTable(LinkedHashMap> t1, LinkedHashMap> t2) { - if (t1.size() != t2.size()) - return false; - - final Iterator>> t1Iterator = t1.entrySet().iterator(); - final Iterator>> t2Iterator = t2.entrySet().iterator(); - - while (t1Iterator.hasNext() && t2Iterator.hasNext()) { - Map.Entry> t1MapEntry = t1Iterator.next(); - Map.Entry> t2MapEntry = t2Iterator.next(); - - if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey()))) - return false; - - final Map table2 = t2MapEntry.getValue(); - for (Map.Entry t1TableEntry : t1MapEntry.getValue().entrySet()) { - final Long t1Key = t1TableEntry.getKey(); - if (!table2.containsKey(t1Key)) - return false; - final RecalDatum t1Datum = t1TableEntry.getValue(); - if (!t1Datum.equals(table2.get(t1Key))) - return false; - } - } - return true; - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 3871ca987..393dd5735 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import java.util.Arrays; + /** * BaseUtils contains some basic utilities for manipulating nucleotides. */ @@ -47,6 +49,20 @@ public class BaseUtils { public boolean sameBase(int i) { return index == i; } } + static private final int[] baseIndexMap = new int[256]; + static { + Arrays.fill(baseIndexMap, -1); + baseIndexMap['A'] = 0; + baseIndexMap['a'] = 0; + baseIndexMap['*'] = 0; // the wildcard character counts as an A + baseIndexMap['C'] = 1; + baseIndexMap['c'] = 1; + baseIndexMap['G'] = 2; + baseIndexMap['g'] = 2; + baseIndexMap['T'] = 3; + baseIndexMap['t'] = 3; + } + // todo -- fix me (enums?) 
public static final byte DELETION_INDEX = 4; public static final byte NO_CALL_INDEX = 5; // (this is 'N') @@ -182,27 +198,7 @@ public class BaseUtils { * @return 0, 1, 2, 3, or -1 if the base can't be understood */ static public int simpleBaseToBaseIndex(byte base) { - switch (base) { - case '*': // the wildcard character counts as an A - case 'A': - case 'a': - return 0; - - case 'C': - case 'c': - return 1; - - case 'G': - case 'g': - return 2; - - case 'T': - case 't': - return 3; - - default: - return -1; - } + return baseIndexMap[base]; } /** @@ -213,27 +209,7 @@ public class BaseUtils { */ @Deprecated static public int simpleBaseToBaseIndex(char base) { - switch (base) { - case '*': // the wildcard character counts as an A - case 'A': - case 'a': - return 0; - - case 'C': - case 'c': - return 1; - - case 'G': - case 'g': - return 2; - - case 'T': - case 't': - return 3; - - default: - return -1; - } + return baseIndexMap[base]; } static public int extendedBaseToBaseIndex(byte base) { @@ -284,11 +260,6 @@ public class BaseUtils { } } - @Deprecated - static public char baseIndexToSimpleBaseAsChar(int baseIndex) { - return (char) baseIndexToSimpleBase(baseIndex); - } - /** * Converts a base index to a base index representing its cross-talk partner * diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java index 8652d3c28..6e79b7f24 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java @@ -25,7 +25,9 @@ package org.broadinstitute.sting.utils.collections; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; /** @@ -83,4 +85,53 @@ public class NestedHashMap { return value; // todo -- should never reach this point } + + public List getAllValues() { + List result = new ArrayList(); + 
fillAllValues(data, result); + return result; + } + + private void fillAllValues(final Map map, final List result) { + for ( Object value : map.values() ) { + if ( value == null ) + continue; + if ( value instanceof Map ) + fillAllValues((Map)value, result); + else + result.add(value); + } + } + + public static class Leaf { + public final List keys; + public final Object value; + + public Leaf(final List keys, final Object value) { + this.keys = keys; + this.value = value; + } + } + + public List getAllLeaves() { + List result = new ArrayList(); + List path = new ArrayList(); + fillAllLeaves(data, path, result); + return result; + } + + private void fillAllLeaves(final Map map, final List path, final List result) { + for ( final Object key : map.keySet() ) { + final Object value = map.get(key); + if ( value == null ) + continue; + final List newPath = new ArrayList(path); + newPath.add(key); + if ( value instanceof Map ) { + fillAllLeaves((Map) value, newPath, result); + } else { + result.add(new Leaf(newPath, value)); + } + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 631d69858..3612693da 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -28,10 +28,10 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.walkers.bqsr.*; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; -import java.util.*; /** * Utility methods to facilitate on-the-fly base quality score recalibration. 
@@ -45,39 +45,15 @@ public class BaseRecalibration { private final ReadCovariates readCovariates; private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) - private final KeysAndTables keysAndTables; + private final RecalibrationTables recalibrationTables; private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation - static class KeysAndTables { + private final Object[] tempKeySet; - public enum Type { - READ_GROUP_TABLE(0), - QUALITY_SCORE_TABLE(1), - OPTIONAL_COVARIATE_TABLE(2); - - private final int index; - - private Type(int index) { - this.index = index; - } - } - - public final BQSRKeyManager[] managers = new BQSRKeyManager[Type.values().length]; - public final Map[] tables = new Map[Type.values().length]; - - public KeysAndTables(final Map> keysAndTablesMap) { - for (Map.Entry> mapEntry : keysAndTablesMap.entrySet()) { - Type type; - if (mapEntry.getKey().getNumRequiredCovariates() == 1) - type = Type.READ_GROUP_TABLE; - else if (mapEntry.getKey().getNumOptionalCovariates() == 0) - type = Type.QUALITY_SCORE_TABLE; - else - type = Type.OPTIONAL_COVARIATE_TABLE; - managers[type.index] = mapEntry.getKey(); - tables[type.index] = mapEntry.getValue(); - } - } + private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. 
+ static { + for (int i = 0; i < EventType.values().length; i++) + qualityScoreByFullCovariateKey[i] = new NestedHashMap(); } /** @@ -89,7 +65,7 @@ public class BaseRecalibration { public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) { RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); - keysAndTables = new KeysAndTables(recalibrationReport.getKeysAndTablesMap()); + recalibrationTables = recalibrationReport.getRecalibrationTables(); requestedCovariates = recalibrationReport.getRequestedCovariates(); quantizationInfo = recalibrationReport.getQuantizationInfo(); if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores @@ -98,20 +74,22 @@ public class BaseRecalibration { quantizationInfo.quantizeQualityScores(quantizationLevels); readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); + tempKeySet = new Integer[requestedCovariates.length]; } /** * This constructor only exists for testing purposes. 
* * @param quantizationInfo the quantization info object - * @param keysAndTablesMap the map of key managers and recalibration tables + * @param recalibrationTables the map of key managers and recalibration tables * @param requestedCovariates the list of requested covariates */ - protected BaseRecalibration(final QuantizationInfo quantizationInfo, final LinkedHashMap> keysAndTablesMap, final Covariate[] requestedCovariates) { + protected BaseRecalibration(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates) { this.quantizationInfo = quantizationInfo; - keysAndTables = new KeysAndTables(keysAndTablesMap); + this.recalibrationTables = recalibrationTables; this.requestedCovariates = requestedCovariates; readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); + tempKeySet = new Integer[requestedCovariates.length]; } /** @@ -125,13 +103,20 @@ public class BaseRecalibration { RecalDataManager.computeCovariates(read, requestedCovariates, readCovariates); // compute all covariates for the read for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings final byte[] quals = read.getBaseQualities(errorModel); + final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); // get the keyset for this base using the error model + + final int readLength = read.getReadLength(); + for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read - for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read final byte originalQualityScore = quals[offset]; if (originalQualityScore >= QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) - final long[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model - final byte 
recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + final int[] keySet = fullReadKeySet[offset]; // get the keyset for this base using the error model + Byte recalibratedQualityScore = (Byte) qualityScoreByFullCovariateKey[errorModel.index].get(wrapKeySet(keySet)); + if (recalibratedQualityScore == null) { + recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + qualityScoreByFullCovariateKey[errorModel.index].put(recalibratedQualityScore, keySet); + } quals[offset] = recalibratedQualityScore; } } @@ -139,7 +124,11 @@ public class BaseRecalibration { } } - + private Object[] wrapKeySet(final int[] keySet) { + for (int i = 0; i < keySet.length; i++) + tempKeySet[i] = keySet[i]; + return tempKeySet; + } /** * Implements a serial recalibration of the reads using the combinational table. @@ -158,24 +147,23 @@ public class BaseRecalibration { * @param errorModel the event type * @return A recalibrated quality score as a byte */ - protected byte performSequentialQualityCalculation(final long[] key, final EventType errorModel) { + protected byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) { - final double globalDeltaQ = calculateGlobalDeltaQ(keysAndTables.managers[KeysAndTables.Type.READ_GROUP_TABLE.index], keysAndTables.tables[KeysAndTables.Type.READ_GROUP_TABLE.index], key, errorModel); - final double deltaQReported = calculateDeltaQReported(keysAndTables.managers[KeysAndTables.Type.QUALITY_SCORE_TABLE.index], keysAndTables.tables[KeysAndTables.Type.QUALITY_SCORE_TABLE.index], key, errorModel, globalDeltaQ); - final double deltaQCovariates = calculateDeltaQCovariates(keysAndTables.managers[KeysAndTables.Type.OPTIONAL_COVARIATE_TABLE.index], keysAndTables.tables[KeysAndTables.Type.OPTIONAL_COVARIATE_TABLE.index], key, errorModel, globalDeltaQ, deltaQReported); + final byte qualFromRead = (byte)(long)key[1]; + final double 
globalDeltaQ = calculateGlobalDeltaQ(recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE), key, errorModel); + final double deltaQReported = calculateDeltaQReported(recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE), key, errorModel, globalDeltaQ, qualFromRead); + final double deltaQCovariates = calculateDeltaQCovariates(recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLE), key, errorModel, globalDeltaQ, deltaQReported, qualFromRead); - final byte qualFromRead = (byte)key[1]; double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality } - private double calculateGlobalDeltaQ(final BQSRKeyManager keyManager, final Map table, final long[] key, final EventType errorModel) { + private double calculateGlobalDeltaQ(final NestedHashMap table, final int[] key, final EventType errorModel) { double result = 0.0; - final long masterKey = keyManager.createMasterKey(key, errorModel, -1); - final RecalDatum empiricalQualRG = table.get(masterKey); + final RecalDatum empiricalQualRG = (RecalDatum)table.get(key[0], errorModel.index); if (empiricalQualRG != null) { final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality(); final double aggregrateQReported = empiricalQualRG.getEstimatedQReported(); @@ -185,32 +173,28 @@ public class BaseRecalibration { return result; } - private double calculateDeltaQReported(final BQSRKeyManager keyManager, final Map table, final long[] key, final EventType errorModel, final double globalDeltaQ) { + private double calculateDeltaQReported(final NestedHashMap 
table, final int[] key, final EventType errorModel, final double globalDeltaQ, final byte qualFromRead) { double result = 0.0; - final long masterKey = keyManager.createMasterKey(key, errorModel, -1); - final RecalDatum empiricalQualQS = table.get(masterKey); + final RecalDatum empiricalQualQS = (RecalDatum)table.get(key[0], key[1], errorModel.index); if (empiricalQualQS != null) { final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality(); - final byte qualFromRead = (byte)key[1]; result = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; } return result; } - private double calculateDeltaQCovariates(final BQSRKeyManager keyManager, final Map table, final long[] key, final EventType errorModel, final double globalDeltaQ, final double deltaQReported) { + private double calculateDeltaQCovariates(final NestedHashMap table, final int[] key, final EventType errorModel, final double globalDeltaQ, final double deltaQReported, final byte qualFromRead) { double result = 0.0; - final int numOptionalCovariates = keyManager.getNumOptionalCovariates(); - for (int i = 0; i < numOptionalCovariates; i++) { - final long masterKey = keyManager.createMasterKey(key, errorModel, i); - if (masterKey < 0) + // for all optional covariates + for (int i = 2; i < requestedCovariates.length; i++) { + if (key[i] < 0) continue; - final RecalDatum empiricalQualCO = table.get(masterKey); + final RecalDatum empiricalQualCO = (RecalDatum)table.get(key[0], key[1], (i-2), key[i], errorModel.index); if (empiricalQualCO != null) { final double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality(); - final byte qualFromRead = (byte)key[1]; result += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java new file mode 100644 index 000000000..aa77b5142 --- 
/dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.utils.collections.NestedHashMap; + +/** + * Utility class to facilitate on-the-fly base quality score recalibration. 
+ * + * User: ebanks + * Date: 6/20/12 + */ + +public class RecalibrationTables { + + public enum TableType { + READ_GROUP_TABLE(0), + QUALITY_SCORE_TABLE(1), + OPTIONAL_COVARIATE_TABLE(2); + + private final int index; + + private TableType(final int index) { + this.index = index; + } + } + + private final NestedHashMap[] tables = new NestedHashMap[TableType.values().length]; + + public RecalibrationTables(final NestedHashMap rgMap, final NestedHashMap qualMap, final NestedHashMap covMap) { + tables[TableType.READ_GROUP_TABLE.index] = rgMap; + tables[TableType.QUALITY_SCORE_TABLE.index] = qualMap; + tables[TableType.OPTIONAL_COVARIATE_TABLE.index] = covMap; + } + + public NestedHashMap getTable(final TableType type) { + return tables[type.index]; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java deleted file mode 100644 index da1678d54..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java +++ /dev/null @@ -1,158 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; - -/** - * @author Mauricio Carneiro - * @since 3/7/12 - */ -public class BQSRKeyManagerUnitTest { - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - } - - @Test(enabled = false) - public void testCombineBitSets() { - final int nRequired = 2; - final ArrayList covariates = new ArrayList(); - 
covariates.add(new ReadGroupCovariate()); - covariates.add(new QualityScoreCovariate()); - covariates.add(new CycleCovariate()); - covariates.add(new ContextCovariate()); - createReadAndTest(covariates, nRequired); - } - - @Test(enabled = true) - public void testOnlyRequiredCovariates() { - final int nRequired = 2; - final ArrayList covariates = new ArrayList(2); - covariates.add(new ReadGroupCovariate()); - covariates.add(new QualityScoreCovariate()); - createReadAndTest(covariates, nRequired); - } - - @Test(enabled = true) - public void testOnlyOneCovariate() { - final int nRequired = 1; - final ArrayList covariates = new ArrayList(2); - covariates.add(new ReadGroupCovariate()); - createReadAndTest(covariates, nRequired); - } - - @Test(enabled = false) - public void testOneCovariateWithOptionalCovariates() { - final int nRequired = 1; - final ArrayList covariates = new ArrayList(4); - covariates.add(new ReadGroupCovariate()); - covariates.add(new QualityScoreCovariate()); - covariates.add(new CycleCovariate()); - covariates.add(new ContextCovariate()); - createReadAndTest(covariates, nRequired); - } - - private void createReadAndTest(List covariates, int nRequired) { - int readLength = 1000; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(ReadUtils.createRandomReadBases(readLength, true), ReadUtils.createRandomReadQuals(readLength), readLength + "M"); - read.setReadGroup(new GATKSAMReadGroupRecord("ID")); - read.getReadGroup().setPlatform("illumina"); - - runTestOnRead(read, covariates, nRequired); - read.setReadNegativeStrandFlag(true); - runTestOnRead(read, covariates, nRequired); - read.setReadPairedFlag(true); - read.setSecondOfPairFlag(true); - runTestOnRead(read, covariates, nRequired); - read.setReadNegativeStrandFlag(false); - runTestOnRead(read, covariates, nRequired); - } - - private void runTestOnRead(GATKSAMRecord read, List covariateList, int nRequired) { - final long[][][] covariateKeys = new 
long[covariateList.size()][EventType.values().length][read.getReadLength()]; - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), covariateList.size()); - for (int i = 0; i < covariateList.size(); i++) { - final Covariate cov = covariateList.get(i); - cov.initialize(RAC); - readCovariates.setCovariateIndex(i); - cov.recordValues(read, readCovariates); - } - for (int i = 0; i < read.getReadLength(); i++) { - for (EventType eventType : EventType.values()) { - final long[] vals = readCovariates.getKeySet(i, eventType); - for (int j = 0; j < vals.length; j++) - covariateKeys[j][eventType.index][i] = vals[j]; - } - } - - List requiredCovariates = new LinkedList(); - List optionalCovariates = new LinkedList(); - - for (int j=0; j optionalCovariates, - final Object[] expectedRequired, final Object[] expectedCovariate, final EventType eventType, final int index) { - - Object[] actual = keyManager.keySetFrom(key).toArray(); - - // Build the expected array - Object[] expected = new Object[nRequired + (optionalCovariates.size() > 0 ? 
3 : 1)]; - System.arraycopy(expectedRequired, 0, expected, 0, nRequired); - if (optionalCovariates.size() > 0) { - expected[expected.length-3] = expectedCovariate[index]; - expected[expected.length-2] = optionalCovariates.get(index).getClass().getSimpleName().split("Covariate")[0]; - } - expected[expected.length-1] = eventType; - -// System.out.println("Actual : " + Utils.join(",", Arrays.asList(actual))); -// System.out.println("Expected: " + Utils.join(",", Arrays.asList(expected))); -// System.out.println(); - - for (int k = 0; k < expected.length; k++) - Assert.assertEquals(actual[k], expected[k]); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index ee5395454..553b7e237 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -36,7 +36,7 @@ public class ContextCovariateUnitTest { verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); } - public static void verifyCovariateArray(long[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { + public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { for (int i = 0; i < values.length; i++) Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java index 79b57fd8f..3fa1e916d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java @@ -47,7 +47,7 @@ public class CycleCovariateUnitTest { verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); } - private void verifyCovariateArray(long[][] values, int init, int increment) { + private void verifyCovariateArray(int[][] values, int init, int increment) { for (short i = 0; i < values.length; i++) { short actual = Short.decode(covariate.formatKey(values[i][0])); int expected = init + (increment * i); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java index 4970413e8..a83508353 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java @@ -46,8 +46,8 @@ public class ReadGroupCovariateUnitTest { } - private void verifyCovariateArray(long[][] values, String expected) { - for (long[] value : values) { + private void verifyCovariateArray(int[][] values, String expected) { + for (int[] value : values) { String actual = covariate.formatKey(value[0]); Assert.assertEquals(actual, expected); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java index e5fde0efc..d1f2d6342 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java @@ -1,7 +1,9 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; 
+import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -32,7 +34,6 @@ public class RecalibrationReportUnitTest { final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - final LinkedHashMap> keysAndTablesMap = new LinkedHashMap>(); quantizationInfo.noQuantization(); final List requiredCovariates = new LinkedList(); @@ -41,14 +42,10 @@ public class RecalibrationReportUnitTest { final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); rgCovariate.initialize(RAC); requiredCovariates.add(rgCovariate); - final BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(rgKeyManager, new HashMap()); final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); qsCovariate.initialize(RAC); requiredCovariates.add(qsCovariate); - final BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(qsKeyManager, new HashMap()); final ContextCovariate cxCovariate = new ContextCovariate(); cxCovariate.initialize(RAC); @@ -56,8 +53,6 @@ public class RecalibrationReportUnitTest { final CycleCovariate cyCovariate = new CycleCovariate(); cyCovariate.initialize(RAC); optionalCovariates.add(cyCovariate); - BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(cvKeyManager, new HashMap()); final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; int covariateIndex = 0; @@ -75,34 +70,35 @@ public class RecalibrationReportUnitTest { readQuals[i] = 20; read.setBaseQualities(readQuals); - final int expectedKeys = expectedNumberOfKeys(4, length, 
RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); int nKeys = 0; // keep track of how many keys were produced final ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); - for (int offset = 0; offset < length; offset++) { - for (Map.Entry> entry : keysAndTablesMap.entrySet()) { - BQSRKeyManager keyManager = entry.getKey(); - Map table = entry.getValue(); - final int numOptionalCovariates = keyManager.getNumOptionalCovariates(); - if (numOptionalCovariates == 0) { - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_SUBSTITUTION, -1), RecalDatum.createRandomRecalDatum(10000, 10)); - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_INSERTION, -1), RecalDatum.createRandomRecalDatum(100000, 10)); - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_DELETION, -1), RecalDatum.createRandomRecalDatum(100000, 10)); - nKeys += 3; - } else { - for (int j = 0; j < numOptionalCovariates; j++) { - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_SUBSTITUTION, j), RecalDatum.createRandomRecalDatum(10000, 10)); - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_INSERTION, j), RecalDatum.createRandomRecalDatum(100000, 10)); - table.put(keyManager.createMasterKey(rc.getMismatchesKeySet(offset), EventType.BASE_DELETION, j), RecalDatum.createRandomRecalDatum(100000, 10)); - nKeys += 3; - } + final NestedHashMap rgTable = new NestedHashMap(); + final NestedHashMap qualTable = new NestedHashMap(); + final NestedHashMap covTable = new NestedHashMap(); + + for (int offset = 0; offset < length; offset++) { + + for (EventType errorMode : EventType.values()) { + + final int[] covariates = rc.getKeySet(offset, errorMode); + final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 
10000 : 100000; + + rgTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.index); + qualTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.index); + nKeys += 2; + for (int j = 0; j < optionalCovariates.size(); j++) { + covTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], j, covariates[2 + j], errorMode.index); + nKeys++; } } } Assert.assertEquals(nKeys, expectedKeys); - RecalibrationReport report = new RecalibrationReport(quantizationInfo, keysAndTablesMap, RAC.generateReportTable(), RAC); + final RecalibrationTables recalibrationTables = new RecalibrationTables(rgTable, qualTable, covTable); + + final RecalibrationReport report = new RecalibrationReport(quantizationInfo, recalibrationTables, RAC.generateReportTable(), RAC); File output = new File("RecalibrationReportUnitTestOutuput.grp"); PrintStream out; diff --git a/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java index fd53283b1..32fe7597d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.walkers.bqsr.BQSRKeyManager; -import org.broadinstitute.sting.gatk.walkers.bqsr.ContextCovariate; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -59,19 +57,4 @@ public class BitSetUtilsUnitTest { //for (String d : dna) // Assert.assertEquals(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(d)), d); } - - @Test(enabled = true) - public void testNumberOfBitsToRepresent() { - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(0), 0); // Make sure 0 elements need 0 bits to be represented - 
Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(1), 1); // Make sure 1 element needs 1 bit to be represented - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(3), 2); // Make sure 3 elements need 2 bit to be represented - - for (int i = 1; i < 63; i++) { // Can't test i == 63 because n1 is a negative number - long n1 = 1L << i; - long n2 = Math.abs(random.nextLong()) % n1; - long n3 = n1 | n2; - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(n3), (n3 == n1) ? i : i + 1); - Assert.assertEquals(BQSRKeyManager.numberOfBitsToRepresent(n1), i); - } - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java index f70466d4f..982ac03bd 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.walkers.bqsr.*; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -22,7 +21,7 @@ import java.util.*; public class BaseRecalibrationUnitTest { private org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager dataManager; - private LinkedHashMap> keysAndTablesMap; + private RecalibrationTables recalibrationTables; private ReadGroupCovariate rgCovariate; private QualityScoreCovariate qsCovariate; @@ -50,19 +49,14 @@ public class BaseRecalibrationUnitTest { List optionalCovariates = new ArrayList(); dataManager = new org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager(true, 4); - keysAndTablesMap = 
new LinkedHashMap>(); rgCovariate = new ReadGroupCovariate(); rgCovariate.initialize(RAC); requiredCovariates.add(rgCovariate); - BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(rgKeyManager, new HashMap()); qsCovariate = new QualityScoreCovariate(); qsCovariate.initialize(RAC); requiredCovariates.add(qsCovariate); - BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(qsKeyManager, new HashMap()); cxCovariate = new ContextCovariate(); cxCovariate.initialize(RAC); @@ -70,8 +64,6 @@ public class BaseRecalibrationUnitTest { cyCovariate = new CycleCovariate(); cyCovariate.initialize(RAC); optionalCovariates.add(cyCovariate); - BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); - keysAndTablesMap.put(cvKeyManager, new HashMap()); final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; int covariateIndex = 0; @@ -82,10 +74,13 @@ public class BaseRecalibrationUnitTest { readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); - for (int i=0; i> mapEntry : keysAndTablesMap.entrySet()) { - final BQSRKeyManager keyManager = mapEntry.getKey(); - final int numOptionalCovariates = keyManager.getNumOptionalCovariates(); - if (numOptionalCovariates == 0) { - final long masterKey = keyManager.createMasterKey(bitKeys, EventType.BASE_SUBSTITUTION, -1); - updateCovariateWithKeySet(mapEntry.getValue(), masterKey, newDatum); - } else { - for (int j = 0; j < numOptionalCovariates; j++) { - final long masterKey = keyManager.createMasterKey(bitKeys, EventType.BASE_SUBSTITUTION, j); - updateCovariateWithKeySet(mapEntry.getValue(), masterKey, newDatum); - } - } + + rgTable.put(newDatum, bitKeys[0], EventType.BASE_SUBSTITUTION.index); + qualTable.put(newDatum, bitKeys[0], bitKeys[1], EventType.BASE_SUBSTITUTION.index); + for (int j = 0; j < 
optionalCovariates.size(); j++) { + covTable.put(newDatum, bitKeys[0], bitKeys[1], j, bitKeys[2 + j], EventType.BASE_SUBSTITUTION.index); } } - dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_RECALIBRATED_Q_SCORE); + + recalibrationTables = new RecalibrationTables(rgTable, qualTable, covTable); + + dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_RECALIBRATED_Q_SCORE); List quantizedQuals = new ArrayList(); List qualCounts = new ArrayList(); @@ -121,16 +112,15 @@ public class BaseRecalibrationUnitTest { } QuantizationInfo quantizationInfo = new QuantizationInfo(quantizedQuals, qualCounts); quantizationInfo.noQuantization(); - baseRecalibration = new BaseRecalibration(quantizationInfo, keysAndTablesMap, requestedCovariates); + baseRecalibration = new BaseRecalibration(quantizationInfo, recalibrationTables, requestedCovariates); } @Test(enabled=false) public void testGoldStandardComparison() { - debugTables(); for (int i = 0; i < read.getReadLength(); i++) { - long [] bitKey = readCovariates.getKeySet(i, EventType.BASE_SUBSTITUTION); + int [] bitKey = readCovariates.getKeySet(i, EventType.BASE_SUBSTITUTION); Object [] objKey = buildObjectKey(bitKey); byte v2 = baseRecalibration.performSequentialQualityCalculation(bitKey, EventType.BASE_SUBSTITUTION); byte v1 = goldStandardSequentialCalculation(objKey); @@ -138,7 +128,7 @@ public class BaseRecalibrationUnitTest { } } - private Object[] buildObjectKey(long[] bitKey) { + private Object[] buildObjectKey(final int[] bitKey) { Object[] key = new Object[bitKey.length]; key[0] = rgCovariate.formatKey(bitKey[0]); key[1] = qsCovariate.formatKey(bitKey[1]); @@ -147,49 +137,6 @@ public class BaseRecalibrationUnitTest { return key; } - private void debugTables() { - System.out.println("\nV1 Table\n"); - System.out.println("ReadGroup Table:"); - NestedHashMap nestedTable = dataManager.getCollapsedTable(0); - printNestedHashMap(nestedTable.data, ""); - System.out.println("\nQualityScore Table:"); - nestedTable 
= dataManager.getCollapsedTable(1); - printNestedHashMap(nestedTable.data, ""); - System.out.println("\nCovariates Table:"); - nestedTable = dataManager.getCollapsedTable(2); - printNestedHashMap(nestedTable.data, ""); - nestedTable = dataManager.getCollapsedTable(3); - printNestedHashMap(nestedTable.data, ""); - - - int i = 0; - System.out.println("\nV2 Table\n"); - for (Map.Entry> mapEntry : keysAndTablesMap.entrySet()) { - BQSRKeyManager keyManager = mapEntry.getKey(); - Map table = mapEntry.getValue(); - switch(i++) { - case 0 : - System.out.println("ReadGroup Table:"); - break; - case 1 : - System.out.println("QualityScore Table:"); - break; - case 2 : - System.out.println("Covariates Table:"); - break; - } - for (Map.Entry entry : table.entrySet()) { - Long key = entry.getKey(); - RecalDatum datum = entry.getValue(); - List keySet = keyManager.keySetFrom(key); - System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum) + "," + datum.getEstimatedQReported()); - } - System.out.println(); - } - - - } - private static void printNestedHashMap(Map table, String output) { for (Object key : table.keySet()) { String ret; From dc7636b923c7e0fae628e62304c5261c7206cd26 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 28 Jun 2012 02:29:35 -0400 Subject: [PATCH 22/32] Refactor the ContextCovariate to significantly reduce runtime --- .../gatk/walkers/bqsr/ContextCovariate.java | 99 ++++++++++++++++--- 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index 365c816c7..7da3c372e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -32,6 +32,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.ArrayList; + /** * Created by IntelliJ IDEA. * User: rpoplin @@ -43,6 +45,16 @@ public class ContextCovariate implements StandardCovariate { private int mismatchesContextSize; private int indelsContextSize; + private int mismatchesKeyMask; + private int indelsKeyMask; + + private static final int LENGTH_BITS = 4; + private static final int LENGTH_MASK = 15; + + // temporary lists to use for creating context covariate keys + private final ArrayList mismatchKeys = new ArrayList(200); + private final ArrayList indelKeys = new ArrayList(200); + // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. static final private int MAX_DNA_CONTEXT = 13; @@ -62,6 +74,9 @@ public class ContextCovariate implements StandardCovariate { if (mismatchesContextSize <= 0 || indelsContextSize <= 0) throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. 
Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize)); + + mismatchesKeyMask = createMask(mismatchesContextSize); + indelsKeyMask = createMask(indelsContextSize); } @Override @@ -75,10 +90,15 @@ public class ContextCovariate implements StandardCovariate { if (negativeStrand) bases = BaseUtils.simpleReverseComplement(bases); - final int readLength = clippedRead.getReadLength(); + mismatchKeys.clear(); + indelKeys.clear(); + contextWith(bases, mismatchesContextSize, mismatchKeys, mismatchesKeyMask); + contextWith(bases, indelsContextSize, indelKeys, indelsKeyMask); + + final int readLength = bases.length; for (int i = 0; i < readLength; i++) { - final int indelKey = contextWith(bases, i, indelsContextSize); - values.addCovariate(contextWith(bases, i, mismatchesContextSize), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i)); + final int indelKey = indelKeys.get(i); + values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i)); } } @@ -101,17 +121,72 @@ public class ContextCovariate implements StandardCovariate { return keyFromContext((String) value); } + private static int createMask(final int contextSize) { + int mask = 0; + // create 2*contextSize worth of bits + for (int i = 0; i < contextSize; i++) + mask = (mask << 2) | 3; + // shift 4 bits to mask out the bits used to encode the length + return mask << LENGTH_BITS; + } + /** * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion) * * @param bases the bases in the read to build the context from - * @param offset the position in the read to calculate the context for * @param contextSize context size to use building the context - * @return the key representing the context + * @param keys list to store the keys + * @param mask mask for pulling out just the context bits */ - private int contextWith(final byte[] bases, final int offset, final int contextSize) { - final int start = offset - 
contextSize + 1; - return (start >= 0) ? keyFromContext(bases, start, offset + 1) : -1; + private static void contextWith(final byte[] bases, final int contextSize, final ArrayList keys, final int mask) { + + // the first contextSize-1 bases will not have enough previous context + for (int i = 1; i < contextSize && i <= bases.length; i++) + keys.add(-1); + + if (bases.length < contextSize) + return; + + final int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS; + + // get (and add) the key for the context starting at the first base + int currentKey = keyFromContext(bases, 0, contextSize); + keys.add(currentKey); + + // if the first key was -1 then there was an N in the context; figure out how many more consecutive contexts it affects + int currentNPenalty = 0; + if (currentKey == -1) { + currentKey = 0; + currentNPenalty = contextSize - 1; + int offset = newBaseOffset; + while (bases[currentNPenalty] != 'N') { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentNPenalty]); + currentKey |= (baseIndex << offset); + offset -= 2; + currentNPenalty--; + } + } + + final int readLength = bases.length; + for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]); + if (baseIndex == -1) { // ignore non-ACGT bases + currentNPenalty = contextSize; + currentKey = 0; // reset the key + } else { + // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in + currentKey = (currentKey >> 2) & mask; + currentKey |= (baseIndex << newBaseOffset); + currentKey |= contextSize; + } + + if (currentNPenalty == 0) { + keys.add(currentKey); + } else { + currentNPenalty--; + keys.add(-1); + } + } } public static int keyFromContext(final String dna) { @@ -126,9 +201,7 @@ public class ContextCovariate implements StandardCovariate { * @param end the end position in the array (exclusive) * 
@return the key representing the dna sequence */ - public static int keyFromContext(final byte[] dna, final int start, final int end) { - - // TODO -- bit fiddle to ge this all working in a single call to the method (mask out length, shift, OR length back in) + private static int keyFromContext(final byte[] dna, final int start, final int end) { int key = end - start; int bitOffset = 4; @@ -152,8 +225,8 @@ public class ContextCovariate implements StandardCovariate { if (key < 0) throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?"); - final int length = key & 15; // the first 4 bits represent the length (in bp) of the context - int mask = 48; // use the mask to pull out bases + final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context + int mask = 48; // use the mask to pull out bases int offset = 4; StringBuilder dna = new StringBuilder(); From 93426a44b1fdda654973c9bb79663e2452cb2f41 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Jun 2012 17:33:37 -0400 Subject: [PATCH 24/32] Fixes for DiagnoseTargets to be VCF/BCF2 spec complaint -- Don't use DP for average interval depth but rather AVG_INTERVAL_DP, which is a float now, not an int -- Don't add PASS filter value to genotypes, as this is actually considered failing filters in the GATK. 
Genotype filters should be empty for PASSing sites --- .../diagnostics/targets/DiagnoseTargets.java | 15 ++++++++------- .../walkers/diagnostics/targets/ThresHolder.java | 6 ++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index cba38d0de..369731530 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -266,13 +266,13 @@ public class DiagnoseTargets extends LocusWalker { alleles.add(refAllele); alleles.add(SYMBOLIC_ALLELE); - VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF - vcb.filters(new HashSet(statusesToStrings(stats.callableStatuses(thresholds)))); + vcb.filters(new HashSet(statusesToStrings(stats.callableStatuses(thresholds), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); - attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); + attributes.put(ThresHolder.AVG_INTERVAL_DP_KEY, stats.averageCoverage()); vcb = vcb.attributes(attributes); if (debug) { @@ -282,7 +282,7 @@ public class DiagnoseTargets extends LocusWalker { final GenotypeBuilder gb = new GenotypeBuilder(sample); SampleStatistics sampleStat = stats.getSample(sample); - gb.DP((int)sampleStat.averageCoverage()); + gb.attribute(ThresHolder.AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage()); gb.attribute("Q1", sampleStat.getQuantileDepth(0.25)); gb.attribute("MED", 
sampleStat.getQuantileDepth(0.50)); gb.attribute("Q3", sampleStat.getQuantileDepth(0.75)); @@ -290,7 +290,7 @@ public class DiagnoseTargets extends LocusWalker { if (debug) { System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads()); } - gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds))); + gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds), false)); genotypes.add(gb.make()); } @@ -307,11 +307,12 @@ public class DiagnoseTargets extends LocusWalker { * @param statuses the set of statuses to be converted * @return a matching set of strings */ - private List statusesToStrings(Set statuses) { + private List statusesToStrings(Set statuses, final boolean includePASS) { List output = new ArrayList(statuses.size()); for (CallableStatus status : statuses) - output.add(status.name()); + if ( includePASS || status != CallableStatus.PASS ) // adding pass => results in a filter for genotypes + output.add(status.name()); return output; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java index 234906944..0d8195551 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java @@ -31,6 +31,7 @@ import java.util.HashSet; import java.util.Set; class ThresHolder { + public static final String AVG_INTERVAL_DP_KEY = "AVG_INTERVAL_DP"; public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); private final int minimumBaseQuality; @@ -129,12 +130,13 @@ class ThresHolder { // INFO fields for overall data headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - headerLines.add(new 
VCFInfoHeaderLine("AVG_INTERVAL_DP", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype // todo -- find the appropriate VCF constants - headerLines.add(new VCFFormatHeaderLine("AVG_INTERVAL_DP", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); + headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); + headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution.")); headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution.")); From 4811a00891ef5485a7ccbac6b794c785c7260e34 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Jun 2012 17:34:56 -0400 Subject: [PATCH 25/32] GENOTYPE_FILTER_KEY is now a VCFStandardHeaderLine --- .../sting/gatk/walkers/filters/VariantFiltrationWalker.java | 2 +- .../sting/utils/codecs/vcf/VCFStandardHeaderLines.java | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 8f3b0ea07..71352bddd 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -176,7 +176,7 @@ public class VariantFiltrationWalker extends RodWalker { hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString())); if ( genotypeFilterExps.size() > 0 ) - hInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter")); + hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); if ( mask.isBound() ) { hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java index 84c60d9d1..dcc141b00 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java @@ -183,6 +183,7 @@ public class VCFStandardHeaderLines { registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); // INFO lines registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, 
"Stop position of the interval")); From 7144154f53d8776bce37630e9443034e318768e1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Jun 2012 17:39:00 -0400 Subject: [PATCH 26/32] VCFWriter and BCFWriter no longer allow missing samples in the VC compared to their header -- They now throw an error, as its really unsafe to write out ./. as a special case in the VCFWriter as occurred previously. -- Added convenience method in VariantContextUtils.addMissingSamples(vc, allSamples) that returns a complete VC where samples are given ./. Genotype objects -- This allows us to properly pass tests of creating / writing / reading VCFs and BCFs, which previously differed because the VC from the VCF would actually be different from its original VC -- Updated UG, UGEngine, GenotypeAndValidateWalker, CombineVariants, and VariantsToVCF to manage the master list of samples they are writing out and addMissingSamples via the VCU function --- .../walkers/genotyper/UnifiedGenotyper.java | 6 +- .../genotyper/UnifiedGenotyperEngine.java | 55 ++++++++++++++++--- .../validation/GenotypeAndValidateWalker.java | 3 +- .../walkers/variantutils/CombineVariants.java | 19 ++++--- .../walkers/variantutils/VariantsToVCF.java | 4 +- .../variantcontext/VariantContextUtils.java | 26 +++++++++ .../variantcontext/writer/BCF2Writer.java | 4 +- .../variantcontext/writer/VCFWriter.java | 21 +++---- 8 files changed, 103 insertions(+), 35 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 5b9a83a1b..29ca1265c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -187,6 +187,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif // the annotation engine private VariantAnnotatorEngine annotationEngine; + 
private Set samples; + // enable deletions in the pileup @Override public boolean includeReadsWithDeletionAtLoci() { return true; } @@ -231,7 +233,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode"); // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // initialize the verbose writer if ( verboseWriter != null ) @@ -298,7 +300,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * @return the VariantCallContext object */ public List map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext); + return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, samples); } public UGStatistics reduceInit() { return new UGStatistics(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 60fa75f41..3c32d132f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -140,14 +140,39 @@ public class UnifiedGenotyperEngine { } /** - * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. 
+ * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set) * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantCallContext object + * same as the full call but with allSamples == null + * + * @param tracker + * @param refContext + * @param rawContext + * @return */ - public List calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null); + } + + + /** + * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. + * + * If allSamples != null, then the output variantCallContext is guaranteed to contain a genotype + * for every sample in allSamples. If it's null there's no such guarantee. Providing this + * argument is critical when the resulting calls will be written to a VCF file.
+ * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) + * @return the VariantCallContext object + */ + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Set allSamples) { final List results = new ArrayList(2); final List models = getGLModelsToUse(tracker, refContext, rawContext); @@ -168,7 +193,23 @@ public class UnifiedGenotyperEngine { } } - return results; + return addMissingSamples(results, allSamples); + } + + private List addMissingSamples(final List calls, final Set allSamples) { + if ( calls.isEmpty() || allSamples == null ) return calls; + + final List withAllSamples = new ArrayList(calls.size()); + for ( final VariantCallContext call : calls ) { + if ( call == null ) + withAllSamples.add(call); + else { + final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples); + withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit)); + } + } + + return withAllSamples; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java index 22c0131c2..2e3fc26f6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java @@ -261,6 +261,7 @@ public class GenotypeAndValidateWalker extends RodWalker samples; public static class CountedData { private long nAltCalledAlt = 0L; @@ -307,7 +308,7 @@ public class GenotypeAndValidateWalker extends RodWalker header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), 
alleles.getName()); - Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 6a55b024b..629c7f84c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -174,17 +174,24 @@ public class CombineVariants extends RodWalker { /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ private boolean sitesOnlyVCF = false; + private Set samples; public void initialize() { Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit()); + if ( vcfWriter instanceof VariantContextWriterStub) { + sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); + if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); + } else + logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); + if ( PRIORITY_STRING == null ) { PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING); } validateAnnotateUnionArguments(); - Set samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption); + samples = sitesOnlyVCF ? 
Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); if ( SET_KEY.toLowerCase().equals("null") ) SET_KEY = null; @@ -194,15 +201,9 @@ public class CombineVariants extends RodWalker { headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); if ( !ASSUME_IDENTICAL_SAMPLES ) headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions)); - VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? Collections.emptySet() : samples); + VCFHeader vcfHeader = new VCFHeader(headerLines, samples); vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); vcfWriter.writeHeader(vcfHeader); - - if ( vcfWriter instanceof VariantContextWriterStub) { - sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); - if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); - } else - logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); } private void validateAnnotateUnionArguments() { @@ -296,7 +297,7 @@ public class CombineVariants extends RodWalker { VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(builder.make()); + vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples)); } return vcs.isEmpty() ? 
0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index b508a9dd5..e8c6794f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -108,6 +108,7 @@ public class VariantsToVCF extends RodWalker { private Set allowedGenotypeFormatStrings = new HashSet(); private boolean wroteHeader = false; + private Set samples; // for dealing with indels in hapmap CloseableIterator dbsnpIterator = null; @@ -228,7 +229,7 @@ public class VariantsToVCF extends RodWalker { } } - Set samples = new LinkedHashSet(); + samples = new LinkedHashSet(); if ( sampleName != null ) { samples.add(sampleName); } else { @@ -252,6 +253,7 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); + vc = VariantContextUtils.addMissingSamples(vc, samples); vcfwriter.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index b697b3381..ccc0f5971 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -46,6 +46,7 @@ public class VariantContextUtils { public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); final public static JexlEngine engine = new JexlEngine(); public static final int DEFAULT_PLOIDY = 2; @@ -57,6 +58,31 @@ public class 
VariantContextUtils { engine.setDebug(false); } + /** + * Ensures that VC contains all of the samples in allSamples by adding missing samples to + * the resulting VC with default diploid ./. genotypes + * + * @param vc + * @param allSamples + * @return + */ + public static VariantContext addMissingSamples(final VariantContext vc, final Set allSamples) { + // TODO -- what's the fastest way to do this calculation? + final Set missingSamples = new HashSet(allSamples); + missingSamples.removeAll(vc.getSampleNames()); + + if ( missingSamples.isEmpty() ) + return vc; + else { + //logger.warn("Adding " + missingSamples.size() + " missing samples to called context"); + final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes()); + for ( final String missing : missingSamples ) { + gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make()); + } + return new VariantContextBuilder(vc).genotypes(gc).make(); + } + } + /** * Update the attributes of the attributes map given the VariantContext to reflect the * proper chromosome-based VCF tags diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 1e15c2bc5..5555849dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -302,9 +302,7 @@ class BCF2Writer extends IndexingVariantContextWriter { writer.start(encoder, vc); for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); - if ( g == null ) - // we don't have any data about g at all - g = new GenotypeBuilder(name).alleles(MISSING_GENOTYPE).make(); + if ( g == null ) VCFWriter.missingSampleError(vc, header); writer.addGenotype(encoder, vc, g); } writer.done(encoder, vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java 
b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 651223ac3..ee7b1b9ef 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -339,23 +340,12 @@ class VCFWriter extends IndexingVariantContextWriter { */ private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) throws IOException { -// if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) { -// final List badSampleNames = new ArrayList(); -// for ( final Genotype g : vc.getGenotypes() ) -// if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) ) -// badSampleNames.add(g.getSampleName()); -// throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames)); -// } - for ( String sample : mHeader.getGenotypeSamples() ) { mWriter.write(VCFConstants.FIELD_SEPARATOR); Genotype g = vc.getGenotype(sample); if ( g == null ) { - // TODO -- The VariantContext needs to know what the general ploidy is of the samples - // TODO -- We shouldn't be assuming diploid genotypes here! 
- mWriter.write(VCFConstants.EMPTY_GENOTYPE); - continue; + missingSampleError(vc, mHeader); } List attrs = new ArrayList(genotypeFormatKeys.size()); @@ -439,6 +429,13 @@ class VCFWriter extends IndexingVariantContextWriter { } } + public static final void missingSampleError(final VariantContext vc, final VCFHeader header) { + final List badSampleNames = new ArrayList(); + for ( final String x : header.getGenotypeSamples() ) + if ( ! vc.hasGenotype(x) ) badSampleNames.add(x); + throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames)); + } + private boolean isMissingValue(String s) { // we need to deal with the case that it's a list of missing values return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); From 64d7e9320971076112c23df781f0c376542b32b8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Jun 2012 18:18:25 -0400 Subject: [PATCH 27/32] Massive bugfixes -- Previous version was reading the size of the encoded genotypes vector for each genotype. This only worked because I never wrote out genotype field values with > 15 elements. Mauricio's killer DiagnoseTargets VCF uncovered the bug. Unfortunately since symbolic allele clipping is still busted those tests are still disabled -- GenotypeContext getMaxPloidy was returning -1 in the case where there are no genotypes, but the answer should be 0.
--- .../sting/utils/codecs/bcf2/BCF2Decoder.java | 7 +++-- .../bcf2/BCF2GenotypeFieldDecoders.java | 30 +++++++++---------- .../codecs/bcf2/BCF2LazyGenotypesDecoder.java | 3 +- .../variantcontext/GenotypeLikelihoods.java | 5 ++-- .../variantcontext/GenotypesContext.java | 1 + 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index 1bb833868..7a6d96131 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -136,6 +136,10 @@ public final class BCF2Decoder { public final Object decodeTypedValue(final byte typeDescriptor) { final int size = decodeNumberOfElements(typeDescriptor); + return decodeTypedValue(typeDescriptor, size); + } + + public final Object decodeTypedValue(final byte typeDescriptor, final int size) { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); assert size >= 0; @@ -285,8 +289,7 @@ public final class BCF2Decoder { } } - public final int[] decodeIntArray(final byte typeDescriptor) { - final int size = decodeNumberOfElements(typeDescriptor); + public final int[] decodeIntArray(final byte typeDescriptor, final int size) { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); return decodeIntArray(size, type, null); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java index 5a4d1d0da..59537a329 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -104,19 +104,17 @@ public class BCF2GenotypeFieldDecoders { final String field, final BCF2Decoder decoder, final byte 
typeDescriptor, + final int numElements, final GenotypeBuilder[] gbs); } private class GTDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { - // we have to do a bit of low-level processing here as we want to know the size upfronta - final int ploidy = decoder.decodeNumberOfElements(typeDescriptor); - - if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { + if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs); else { - generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs); + generalDecode(siteAlleles, numElements, decoder, typeDescriptor, gbs); } } @@ -218,7 +216,7 @@ public class BCF2GenotypeFieldDecoders { private class DPDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { // the -1 is for missing gb.DP(decoder.decodeInt(typeDescriptor, -1)); @@ -228,7 +226,7 @@ public class BCF2GenotypeFieldDecoders { private class GQDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final 
byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { // the -1 is for missing gb.GQ(decoder.decodeInt(typeDescriptor, -1)); @@ -238,27 +236,27 @@ public class BCF2GenotypeFieldDecoders { private class ADDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { - gb.AD(decoder.decodeIntArray(typeDescriptor)); + gb.AD(decoder.decodeIntArray(typeDescriptor, numElements)); } } } private class PLDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { - gb.PL(decoder.decodeIntArray(typeDescriptor)); + gb.PL(decoder.decodeIntArray(typeDescriptor, numElements)); } } } private class GenericDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor); + Object value = decoder.decodeTypedValue(typeDescriptor, numElements); if ( value != null ) { // don't add missing values if ( value instanceof List && ((List)value).size() == 1) { // todo -- I 
really hate this, and it suggests that the code isn't completely right @@ -275,9 +273,9 @@ public class BCF2GenotypeFieldDecoders { private class FTDecoder implements Decoder { @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) { + public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor); + Object value = decoder.decodeTypedValue(typeDescriptor, numElements); if ( value != null ) { // don't add missing values gb.filters(value instanceof String ? Collections.singletonList((String)value) : (List)value); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index 7f10375bb..c749325fb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -77,9 +77,10 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { // the type of each element final byte typeDescriptor = decoder.readTypeDescriptor(); + final int numElements = decoder.decodeNumberOfElements(typeDescriptor); final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field); try { - fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, builders); + fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, numElements, builders); } catch ( ClassCastException e ) { throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field + " inconsistent with the value observed in the decoded value"); diff --git 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 7c745628a..bb4a5abb9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -58,7 +58,6 @@ public class GenotypeLikelihoods { static { // must be done before PLIndexToAlleleIndex for ( int numAlleles = 1; numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES; numAlleles++ ) { - //numLikelihoodCache[numAlleles] = new int[NUM_LIKELIHOODS_CACHE_PLOIDY]; for ( int ploidy = 1; ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY; ploidy++ ) { numLikelihoodCache[numAlleles][ploidy] = calcNumLikelihoods(numAlleles, ploidy); } @@ -364,11 +363,13 @@ public class GenotypeLikelihoods { * S(N,1) = N (only way to have N integers add up to 1 is all-zeros except one element with a one. There are N of these vectors) * S(1,P) = 1 (only way to have 1 integer add to P is with that integer P itself). * + * note that in the case where ploidy == 0 we assume that the ploidy actually == 2 + * * @param numAlleles Number of alleles (including ref) * @param ploidy Ploidy, or number of chromosomes in set * @return Number of likelihood elements we need to hold. 
*/ - @Requires({"ploidy > 0", "numAlleles > 0"}) + @Requires({"ploidy >= 0", "numAlleles > 0"}) @Ensures("result > 0") public static int numLikelihoods(final int numAlleles, final int ploidy) { if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index 9577a3e63..ba8668fa9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -416,6 +416,7 @@ public class GenotypesContext implements List { @Ensures("result >= 0") public int getMaxPloidy() { if ( maxPloidy == -1 ) { + maxPloidy = 0; // necessary in the case where there are no genotypes for ( final Genotype g : getGenotypes() ) { maxPloidy = Math.max(g.getPloidy(), maxPloidy); } From 064cc563356cecef7f9f27f9bc36f2847b4f1cb0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Jun 2012 18:19:13 -0400 Subject: [PATCH 28/32] Update integration tests to reflect new FT header line standard and new DiagnoseTargets field names --- .../DiagnoseTargetsIntegrationTest.java | 7 +- .../VariantFiltrationIntegrationTest.java | 4 +- ...ntRecalibrationWalkersIntegrationTest.java | 6 +- .../VCFStreamingIntegrationTest.java | 2 +- .../bcf2/BCF2EncoderDecoderUnitTest.java | 3 +- .../utils/codecs/vcf/VCFIntegrationTest.java | 2 +- .../VariantContextTestProvider.java | 103 ++++++++++-------- 7 files changed, 69 insertions(+), 58 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index 355071e73..63b2d39f1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java 
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -36,18 +36,19 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { final String L = validationDataLocation + "DT-itest.interval_list"; private void DTTest(String testName, String args, String md5) { - String base = String.format("-T DiagnoseTargets -R %s -L %s", REF, L) + " -o %s "; + String base = String.format("-T DiagnoseTargets --no_cmdline_in_header -R %s -L %s", REF, L) + " -o %s "; WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); + spec.disableShadowBCF(); executeTest(testName, spec); } @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "2df47009571fe83ead779c94be97fe96"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "ef71a569a48697c89e642cdda7bfb766"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "6f0c070b9671e1d007ce6374c3183014"); + DTTest("testMultiSample ", "-I " + multiSample, "1e6e15156e01e736274898fdac77d911"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index ae5128c75..70a10a0b5 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -80,7 +80,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("060e9e7b6faf8b2f7b3291594eb6b39c")); + 
Arrays.asList("c5ed9dd3975b3602293bb484b4fda5f4")); executeTest("test genotype filter #1", spec1); } @@ -88,7 +88,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'isHomVar == 1' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("00f90028a8c0d56772c47f039816b585")); + Arrays.asList("979ccdf484259117aa31305701075602")); executeTest("test genotype filter #2", spec2); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 857032579..e0cda07d7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -7,10 +7,6 @@ import org.testng.annotations.DataProvider; import java.util.*; public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { - static HashMap clusterFiles = new HashMap(); - static HashMap tranchesFiles = new HashMap(); - static HashMap inputVCFFiles = new HashMap(); - private static class VRTest { String inVCF; String tranchesMD5; @@ -77,7 +73,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", "da4458d05f6396f5c4ab96f274e5ccdc", // tranches "a04a9001f62eff43d363f4d63769f3ee", // recal file - "05e88052e0798f1c1e83f0a8938bce56"); // cut VCF + "b9936d2432d3c85b2d8b5b7aa17d0950"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createData2() { diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index a5cd49971..2b917ae0c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --no_cmdline_in_header " + " -o %s", 1, - Arrays.asList("2cdcd9e140eb1b6da7e365e37dd7d859") + Arrays.asList("283f434b3efbebb8e10ed6347f97d104") ); executeTest("testSimpleVCFStreaming", spec); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java index ef8a67d47..a0feef186 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java @@ -480,7 +480,8 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest { final byte typeDescriptor = decoder.readTypeDescriptor(); // read the int[] with the low-level version - final int[] decoded = decoder.decodeIntArray(typeDescriptor); + final int size = decoder.decodeNumberOfElements(typeDescriptor); + final int[] decoded = decoder.decodeIntArray(typeDescriptor, size); if ( isMissing(ints) ) { // we expect that the result is null in this case diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 14b75fbc6..2a92b85e1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -34,7 +34,7 @@ public class VCFIntegrationTest extends WalkerTest { String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("355b029487c3b4c499140d71310ca37e")); + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("13329ba7360a8beb3afc02569e5a20c4")); executeTest("Test reading and writing breakpoint VCF", spec1); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index e5b45f70f..528f3dd29 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -56,18 +56,23 @@ public class VariantContextTestProvider { final private static boolean ENABLE_VARARRAY_TESTS = true; final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; + final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = false; final private static boolean ENABLE_SOURCE_VCF_TESTS = true; final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true; + final private static List TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); private static VCFHeader syntheticHeader; final static List TEST_DATAs = new ArrayList(); private static VariantContext ROOT; - private final static List testSourceVCFs = Arrays.asList( - new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"), - new File(BaseTest.privateTestDir + "ex2.vcf"), - new File(BaseTest.privateTestDir + 
"dbsnp_135.b37.1000.vcf") - ); + private final static List testSourceVCFs = new ArrayList(); + static { + testSourceVCFs.add(new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf")); + testSourceVCFs.add(new File(BaseTest.privateTestDir + "ex2.vcf")); + testSourceVCFs.add(new File(BaseTest.privateTestDir + "dbsnp_135.b37.1000.vcf")); + if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) + testSourceVCFs.add(new File(BaseTest.privateTestDir + "diagnosis_targets_testfile.vcf")); + } public abstract static class VariantContextIOTest { public String toString() { @@ -245,7 +250,7 @@ public class VariantContextTestProvider { add(builder().attribute("INT3", Arrays.asList(1000, 2000, 3000))); add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000))); add(builder().attribute("INT3", null)); - add(builder().attribute("INT20", Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20))); + add(builder().attribute("INT20", TWENTY_INTS)); add(builder().attribute("FLOAT1", 1.0)); add(builder().attribute("FLOAT1", 100.0)); @@ -449,6 +454,11 @@ public class VariantContextTestProvider { attr("g1", ref, "INT3", 1, 2, 3), attr("g2", ref, "INT3")); + addGenotypeTests(site, + attr("g1", ref, "INT20", TWENTY_INTS), + attr("g2", ref, "INT20", TWENTY_INTS)); + + if (ENABLE_VARARRAY_TESTS) { addGenotypeTests(site, attr("g1", ref, "INT.VAR", 1, 2, 3), @@ -693,20 +703,23 @@ public class VariantContextTestProvider { * @param expected */ public static void assertEquals( final VariantContext actual, final VariantContext expected ) { - Assert.assertNotNull(actual); - Assert.assertEquals(actual.getChr(), expected.getChr()); - Assert.assertEquals(actual.getStart(), expected.getStart()); - Assert.assertEquals(actual.getEnd(), expected.getEnd()); - Assert.assertEquals(actual.getID(), expected.getID()); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); + Assert.assertNotNull(actual, "VariantContext 
expected not null"); + Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); + Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); + Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); + Assert.assertEquals(actual.getID(), expected.getID(), "id"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles"); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); - Assert.assertEquals(actual.getFilters(), expected.getFilters()); + Assert.assertEquals(actual.getFilters(), expected.getFilters(), "filters"); BaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); - Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes()); + Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); if ( expected.hasGenotypes() ) { - Assert.assertEquals(actual.getSampleNames(), expected.getSampleNames()); + final Set actualSampleSet = new HashSet(actual.getSampleNames()); + final Set expectedSampleSet = new HashSet(expected.getSampleNames()); + Assert.assertTrue(actualSampleSet.equals(expectedSampleSet), "sample names"); // note this is necessary due to testng bug for set comps + Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); final Set samples = expected.getSampleNames(); for ( final String sample : samples ) { assertEquals(actual.getGenotype(sample), expected.getGenotype(sample)); @@ -715,33 +728,33 @@ public class VariantContextTestProvider { } public static void assertEquals(final Genotype actual, final Genotype expected) { - Assert.assertEquals(actual.getSampleName(), expected.getSampleName()); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); - Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString()); - Assert.assertEquals(actual.getType(), expected.getType()); + Assert.assertEquals(actual.getSampleName(), 
expected.getSampleName(), "Genotype names"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); + Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); + Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); // filters are the same - Assert.assertEquals(actual.getFilters(), expected.getFilters()); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered()); + Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); // inline attributes - Assert.assertEquals(actual.getDP(), expected.getDP()); - Assert.assertEquals(actual.getAD(), expected.getAD()); - Assert.assertEquals(actual.getGQ(), expected.getGQ()); - Assert.assertEquals(actual.hasPL(), expected.hasPL()); - Assert.assertEquals(actual.hasAD(), expected.hasAD()); - Assert.assertEquals(actual.hasGQ(), expected.hasGQ()); - Assert.assertEquals(actual.hasDP(), expected.hasDP()); + Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); + Assert.assertEquals(actual.getAD(), expected.getAD(), "Genotype ad"); + Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); + Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); + Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); + Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); + Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); - Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods()); - Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString()); - Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods()); - Assert.assertEquals(actual.getPL(), expected.getPL()); + Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); + 
Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); + Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); + Assert.assertEquals(actual.getPL(), expected.getPL(), "Genotype getPL"); - Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual()); + Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); - Assert.assertEquals(actual.isPhased(), expected.isPhased()); - Assert.assertEquals(actual.getPloidy(), expected.getPloidy()); + Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); + Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); } private static void assertAttributesEquals(final Map actual, Map expected) { @@ -753,16 +766,16 @@ public class VariantContextTestProvider { final Object expectedValue = expected.get(act.getKey()); if ( expectedValue instanceof List ) { final List expectedList = (List)expectedValue; - Assert.assertTrue(actualValue instanceof List); + Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); final List actualList = (List)actualValue; - Assert.assertEquals(actualList.size(), expectedList.size()); + Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); for ( int i = 0; i < expectedList.size(); i++ ) - assertAttributesEquals(actualList.get(i), expectedList.get(i)); + assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); } else - assertAttributesEquals(actualValue, expectedValue); + assertAttributeEquals(act.getKey(), actualValue, expectedValue); } else { // it's ok to have a binding in x -> null that's absent in y - Assert.assertNull(actualValue); + Assert.assertNull(actualValue, act.getKey() + " present in one but 
not in the other"); } expectedKeys.remove(act.getKey()); } @@ -771,7 +784,7 @@ public class VariantContextTestProvider { // and they must all be null for ( final String missingExpected : expectedKeys ) { final Object value = expected.get(missingExpected); - Assert.assertTrue(isMissing(value)); + Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); } } @@ -788,12 +801,12 @@ public class VariantContextTestProvider { return false; } - private static void assertAttributesEquals(final Object actual, final Object expected) { + private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { if ( expected instanceof Double ) { // must be very tolerant because doubles are being rounded to 2 sig figs BaseTest.assertEqualsDoubleSmart(actual, (Double)expected, 1e-2); } else - Assert.assertEquals(actual, expected); + Assert.assertEquals(actual, expected, "Attribute " + key); } public static void addComplexGenotypesTest() { @@ -863,14 +876,14 @@ public class VariantContextTestProvider { } public static void assertEquals(final VCFHeader actual, final VCFHeader expected) { - Assert.assertEquals(actual.getMetaData().size(), expected.getMetaData().size()); + Assert.assertEquals(actual.getMetaData().size(), expected.getMetaData().size(), "No VCF header lines"); // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? 
//Assert.assertEquals(actual.getMetaData(), expected.getMetaData()); final List actualLines = new ArrayList(actual.getMetaData()); final List expectedLines = new ArrayList(expected.getMetaData()); for ( int i = 0; i < actualLines.size(); i++ ) { - Assert.assertEquals(actualLines.get(i), expectedLines.get(i)); + Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); } } From 734bb5366b35bea9aac01db8a868fa20bd466b44 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 28 Jun 2012 10:04:15 -0400 Subject: [PATCH 29/32] Special case the situation where we have ploidy == 0 (no GT values) to implicitly assume we have diploid samples -- numLikelihoods no longer allows even ploidy == 0 in requires -- VCFCompoundHeaderLine handles the case where ploidy == 0 => implicit ploidy == 2 --- .../codecs/vcf/VCFCompoundHeaderLine.java | 18 ++++++++++-------- .../variantcontext/GenotypeLikelihoods.java | 4 +--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 6f9a8f5e6..667de3dea 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -75,22 +75,24 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF * If the count is a fixed count, return that. For example, a field with size of 1 in the header returns 1 * If the count is of type A, return vc.getNAlleles - 1 * If the count is of type G, return the expected number of genotypes given the number of alleles in VC and the - * max ploidy among all samples + * max ploidy among all samples. Note that if the max ploidy of the VC is 0 (there's no GT information + * at all, then implicitly assume diploid samples when computing G values. 
* If the count is UNBOUNDED return -1 * * @param vc * @return */ public int getCount(final VariantContext vc) { - int myCount; switch ( countType ) { - case INTEGER: myCount = count; break; - case UNBOUNDED: myCount = -1; break; - case A: myCount = vc.getNAlleles() - 1; break; - case G: myCount = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), vc.getMaxPloidy()); break; - default: throw new ReviewedStingException("Unknown count type: " + countType); + case INTEGER: return count; + case UNBOUNDED: return -1; + case A: return vc.getNAlleles() - 1; + case G: + final int ploidy = vc.getMaxPloidy(); + return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy == 0 ? 2 : ploidy); + default: + throw new ReviewedStingException("Unknown count type: " + countType); } - return myCount; } public void setNumberToUnbounded() { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index bb4a5abb9..d644eda7d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -363,13 +363,11 @@ public class GenotypeLikelihoods { * S(N,1) = N (only way to have N integers add up to 1 is all-zeros except one element with a one. There are N of these vectors) * S(1,P) = 1 (only way to have 1 integer add to P is with that integer P itself). * - * note that in the case where ploidy == 0 we assume that the ploidy actually == 2 - * * @param numAlleles Number of alleles (including ref) * @param ploidy Ploidy, or number of chromosomes in set * @return Number of likelihood elements we need to hold. 
*/ - @Requires({"ploidy >= 0", "numAlleles > 0"}) + @Requires({"ploidy > 0", "numAlleles > 0"}) @Ensures("result > 0") public static int numLikelihoods(final int numAlleles, final int ploidy) { if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES From 1ce0b9d519a1bc3c323a660071598ffafea0ee57 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Thu, 28 Jun 2012 11:21:11 -0400 Subject: [PATCH 30/32] Throwing UnknownTribbleType exception instead of CommandLineException when an unknown tribble type is specified. --- .../sting/commandline/ArgumentTypeDescriptor.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index d5503b2a9..c201e95f0 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -345,7 +345,8 @@ public abstract class ArgumentTypeDescriptor { FeatureManager manager = new FeatureManager(); if ( manager.getByName(tribbleType) == null ) - throw new UserException.CommandLineException( + throw new UserException.UnknownTribbleType( + tribbleType, String.format("Unable to find tribble type '%s' provided on the command line. " + "Please select a correct type from among the supported types:%n%s", tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); From abe74dc32d55df2009ce7db6b3ee6956566468aa Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 27 Jun 2012 14:09:30 -0400 Subject: [PATCH 31/32] Navel -> GXDB --- ivy.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivy.xml b/ivy.xml index f76880b94..5a8c3986a 100644 --- a/ivy.xml +++ b/ivy.xml @@ -97,7 +97,7 @@ - +