From abf6d9eb6abcb92735a861e87ea9144a2cc295e7 Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Tue, 9 Jan 2024 15:37:59 -0500 Subject: [PATCH] Correct integer types in index catalog creation --- src/hipscat_import/catalog/map_reduce.py | 4 +- src/hipscat_import/index/map_reduce.py | 12 +++- src/hipscat_import/soap/map_reduce.py | 14 ++--- .../Norder=0/Dir=0/Npix=11.parquet | Bin 8913 -> 8880 bytes .../small_sky_object_catalog/_common_metadata | Bin 3997 -> 4018 bytes .../data/small_sky_object_catalog/_metadata | Bin 5130 -> 5130 bytes .../catalog_info.json | 2 +- .../small_sky_object_catalog/point_map.fits | Bin 8640 -> 1581120 bytes .../provenance_info.json | 37 ++++++------ tests/hipscat_import/index/test_run_index.py | 53 ++++++++++++++++++ 10 files changed, 90 insertions(+), 32 deletions(-) diff --git a/src/hipscat_import/catalog/map_reduce.py b/src/hipscat_import/catalog/map_reduce.py index 1cc7fe16..9c3115e2 100644 --- a/src/hipscat_import/catalog/map_reduce.py +++ b/src/hipscat_import/catalog/map_reduce.py @@ -16,7 +16,7 @@ # pylint: disable=too-many-locals,too-many-arguments -def _get_pixel_directory(cache_path: FilePointer, order: np.int64, pixel: np.int64): +def _get_pixel_directory(cache_path: FilePointer, order: np.int64, pixel: np.uint64): """Create a path for intermediate pixel data. This will take the form: @@ -81,7 +81,7 @@ def _iterate_input_file( ) # Set up the pixel data mapped_pixels = hp.ang2pix( - 2 ** highest_order, + 2**highest_order, data[ra_column].values, data[dec_column].values, lonlat=True, diff --git a/src/hipscat_import/index/map_reduce.py b/src/hipscat_import/index/map_reduce.py index f728e3df..988104de 100644 --- a/src/hipscat_import/index/map_reduce.py +++ b/src/hipscat_import/index/map_reduce.py @@ -3,7 +3,8 @@ import dask.dataframe as dd import numpy as np from dask.distributed import progress, wait -from hipscat.io import file_io +from hipscat.io import paths +from hipscat.io.file_io import file_io from hipscat.pixel_math.hipscat_id import HIPSCAT_ID_COLUMN @@ -16,13 +17,17 @@ def create_index(args): if args.include_order_pixel: include_columns.extend(["Norder", "Dir", "Npix"]) - index_dir = file_io.append_paths_to_pointer(args.catalog_path, "index") + index_dir = paths.append_paths_to_pointer(args.catalog_path, "index") + metadata_file = paths.get_parquet_metadata_pointer(args.input_catalog_path) + + metadata = file_io.read_parquet_metadata(metadata_file) data = dd.read_parquet( path=args.input_catalog_path, columns=include_columns, engine="pyarrow", - dataset={"partitioning": "hive"}, + dataset={"partitioning": {"flavor": "hive", "schema": metadata.schema.to_arrow_schema()}}, + filesystem="arrow", ) if args.include_order_pixel: @@ -33,6 +38,7 @@ def create_index(args): data = data.reset_index() if not args.include_hipscat_index: data = data.drop(columns=[HIPSCAT_ID_COLUMN]) + data = data.drop_duplicates() data = data.repartition(partition_size=args.compute_partition_size) data = data.set_index(args.indexing_column) result = data.to_parquet( diff --git a/src/hipscat_import/soap/map_reduce.py b/src/hipscat_import/soap/map_reduce.py index 95f1e83a..0a495119 100644 --- a/src/hipscat_import/soap/map_reduce.py +++ b/src/hipscat_import/soap/map_reduce.py @@ -46,12 +46,12 @@ def _count_joins_for_object(source_data, source_pixel, object_pixel, soap_args): joined_data = joined_data.reset_index() joined_data["Norder"] = np.full(rows_written, fill_value=object_pixel.order, dtype=np.uint8) - joined_data["Dir"] = np.full(rows_written, fill_value=object_pixel.dir, dtype=np.uint32) - joined_data["Npix"] = np.full(rows_written, fill_value=object_pixel.pixel, dtype=np.uint32) + joined_data["Dir"] = np.full(rows_written, fill_value=object_pixel.dir, dtype=np.uint64) + joined_data["Npix"] = np.full(rows_written, fill_value=object_pixel.pixel, dtype=np.uint64) joined_data["join_Norder"] = np.full(rows_written, fill_value=source_pixel.order, dtype=np.uint8) - joined_data["join_Dir"] = np.full(rows_written, fill_value=source_pixel.dir, dtype=np.uint32) - joined_data["join_Npix"] = np.full(rows_written, fill_value=source_pixel.pixel, dtype=np.uint32) + joined_data["join_Dir"] = np.full(rows_written, fill_value=source_pixel.dir, dtype=np.uint64) + joined_data["join_Npix"] = np.full(rows_written, fill_value=source_pixel.pixel, dtype=np.uint64) joined_data.to_parquet(output_file, index=True) @@ -64,9 +64,9 @@ def _write_count_results(cache_path, source_healpix, results): dataframe = pd.DataFrame(results, columns=["Norder", "Npix", "num_rows"]) dataframe["Dir"] = [int(order / 10_000) * 10_000 if order >= 0 else -1 for order, _, _ in results] - dataframe["join_Norder"] = np.full(num_results, fill_value=source_healpix.order, dtype=np.int32) - dataframe["join_Dir"] = [int(order / 10_000) * 10_000 for order in dataframe["join_Norder"]] - dataframe["join_Npix"] = np.full(num_results, fill_value=source_healpix.pixel, dtype=np.int32) + dataframe["join_Norder"] = np.full(num_results, fill_value=source_healpix.order, dtype=np.uint8) + dataframe["join_Dir"] = np.full(num_results, fill_value=source_healpix.dir, dtype=np.uint64) + dataframe["join_Npix"] = np.full(num_results, fill_value=source_healpix.pixel, dtype=np.uint64) ## Reorder columns. dataframe = dataframe[["Norder", "Dir", "Npix", "join_Norder", "join_Dir", "join_Npix", "num_rows"]] diff --git a/tests/hipscat_import/data/small_sky_object_catalog/Norder=0/Dir=0/Npix=11.parquet b/tests/hipscat_import/data/small_sky_object_catalog/Norder=0/Dir=0/Npix=11.parquet index 7aed5e2b4083a91b4e801946833f38698730aba0..e0cb8d948d9a14a7ac5937856204a9fa48684fad 100644 GIT binary patch delta 1573 zcmbVM?Q2_A7|+f7`oeBZZ+g?-tZCX@vvsyfnuMk>x|7?O-jF6s*5yqH-Xv{anxtuC znx>-FFBSY^<-qWRj0t`b$~MUtKPciq;FvGN4}NfhpkH*L$OIM7IXBG)G8i7nbDrlp z&+|LK=Xc1x8@wGnnm+y2Fs+zDzr|s_;+J9eqxo%0^CR+q>wZWTD3wA>cWKp#5Nd&2EkhRhg@Lzh9HTaSaqW@*b;<`nJ9WnX-89E2Nz%+}qgy+L z^&LiOI#iloGaXX=KU4g@Db5FLr>5Xl9glkM8%*$~j-EPv_5{q#s+P;WrfZbxI>o=s z^FP^9*L{NnT}(R$74l*fjENaL&m1Zln$a>^&ZltbPYzlAAgt?I_a|o`0jr~p@}5jb zM*hOVUw8bc?XKR^y)$q}J@D|tNiVD$Et)?T`FrQ!k?|b-{dC%CH${$(`ezUoMr~IT z(smA|F7{L8x_%By^eoYjR$U(CKOW(j0a?5U^ot}PGhFTInQkzjnVivMNohTz`21F0m$R0d9?2 zp<{Fg`dK%8i5Y@#k3QG;AC7*Yb$Mo;vo82PJI>A9URAI|Ad!fr7MuCfdQD=FjdOVI z7vZ<;5*3H6d5K!+rp-3BuAXe<>TwTz!@}wcscNgRT!c3)1INfe~#0}w7b;EuS6@EbYeqFc{j6iCH(#j3vY~%JO(s*kaRXh zAzaDVlBGh(-w`WY#r00ODW$@lOgikvu~-jyk>AP9=gScRlnb?FC-1KA=E|Z^6vVx# z;AuxH!ScrCHm)t@YKc z8@otpCoU5_bI8YqpoC)p|FY6{g11%=G;lF0+Qu(Kn8Y4k3O}D5pA1TA5jgvBU=>%Q zmJ_%l{M$NlfD#$Z58-AopomTU5-P7%g5MHnC*!eM8C$Y2ZZi4N$ON$|PoP^?_QfB>^b!+FZjei0Z$2N}t-KMr< z14>U+0-;2DQdI%s-b>ZW90E70UMqwu5I0V62oe_%haM2`Jv&X)RvgMmo;UN&H}ht{ zwSC+FgMa7Ppu?bN9vFCL*1$1;7`QuAmsHw6G4^ZwrYfycb4*X0Q;)DDKzrQz_6OQ) z6h(^h0vhb&P+qr6i#LslZhpS+6T4{A2s5N-M}O78YZz=DZ$`%6ara-xL=#I10^bv< zTux^$n^^P5togdi{FO@j=cx44QSorw_K+3*u0Q_tFz4<|2gB69+@VQ#CFyhNnUgqG zKdR_VW=%icjd(M2Vn0K-dq-yfK6x-$6RGAlAK&rnN$KYkc=EH`-|ZFJxB9-<^nLs0 zBPp&8j%jz_kZw58{TF!j+3*m0Fc>p2CiXbXlf63;y$G#x6I4}By^Mb0^j-Pirl(?j zr>poHJ$Ll0Q_^!OYI_dtZ2wOdbHDT-k_-!bZ{e?jq&Giwk1&kz3g1&rmea`%ey{z_ zFt66;%2%e`=+20V#mA({iVUNUr@{K@tB1kwM(=4o6Rrst`jkH+y6hh@`~WO1%|~Z9 zQtO#glHVP65_18%%b!sJVudrRN?R7}8eOSaOZ&o2v~3a9;g*7KSzbn8kIm|;YvsjO zIEO$KSzNv-I<)htU~w8b#sqCyu4cT*CXT5OQ_#b4-dIXTS2hv>Pc0suEGqd*7F`rf zsA}QShvVL@*m5JQlp@VUG~imBQgZV-`IJ)Htj5cs^;EflbI7IUiZ~9{(@kYV@rK-q zSg4hZdTL~tk}uT4-sy%?c9B{m9-S$~W0CjMrIlvNU97L=l-Crm(ma<3%nNw{AcQ|q z2S31Ec1s43!>|YtAV8#nOWq10WQgxq`at5@GJpIW7m9H90WX-WK3?w z-2qq3U92U7E7P5F9K3W+9H;n7Y**;KEgW?R1}S_nkkb4!jbw~W2Dr4p0Qe~|v!8eS z15fk=S=PT)O=W!-oOP#@mX|vu^7e5g1A5PDM!#9P6@M23pcEM%QPycoiulgvG^Zls z=gY_3Sm=_cNznNP+(oX%Y7$GM0h^UxJQrGjoCv`dX1-21a!1X4H~0I{7uF+%LLwT5 zNxU^3m-Q{;sI&1VX@V!$!RlhLMBfz1Sx^Atzi{E+jX%J9?_g~r3Fn@3?svX(&UZ6!+#lU1j|0A?)Y*dT zn#E#SwD~5O$*9`(t>V=EiIJ`Eeme(tq7(!nhjI&y3^{Y_@oj7lzS66qeEc$%KjqNih z>(~zurJ{mWL0rD8HnVnO4=*f%O$vYO&St+w-q9Bd6Ls&HH2 zg`*4~8m5{twf=BOu#HMP*O1vruv|=WK!h01 zC^!)%SI{OVJ|uRV7<>UB?5+F#B#iL{4LmWlkU9V%d~$Uu-WtlmOr!cBqT@kYHJq&v zu-X9v<3oPz6CgB54Cs7j-beZNG)^aA3_2jMLnq1pYl@!FuB^qRq5y#lI!F^FF8NER zpzky8C7dXc`7m#Z5DLVI=tCz7Uk=9cmCmiS&EE93SGa0(;<=-{IHexy8BG2iaLjfg^=Ud9&7mZk(lQ#Mj& z&JK4hj7OKc2mRXNhOFxwQn%X~dIq&_Pwi+P^OIYCjYe~e=*67V^={5$*@YbY2a%K~ At^fc4 delta 1105 zcma)5TW=Ck5TZB*w%SAN(i27+?GYJ~-zrsOf_`$)270=9`)C%);B%^VJs*mK;l$)-G{Q zQ9rRgSGJ=M#?LlpzPzP9SF9%X2*JXW_7B;-9V7RxE7+u1NrRUp8?|X?9a5{iJqX# zYZ~2(n6b#%>vv{`5_#>tN#4)yu64?-+%Tg{r&284t;peiwGid2ieN~Mc1@aAl`h|M zA8JWsi>%Ksds;%L8k5K;5BLA&5deWta}YWUOEZ@X?&muae!29Bmf8*$N?lMknLkC)~19i zJ>r~l_e%=x4Dfg=0OAbTp2m~<#D#jGV&nOaRuk`sdZ7?YAKN7QgF#OK zQeL=DzAjuT#3v8{qX_VeQfDO^J#FPIW=q8THIxW0+KgHhJv_oHIu^G{twI(T18i_A zUH_8^$sscDQjTD6voVR|A~|1J88wS?2EwSCwX2Ca^lZfgS_Bv6;C2x-*_Q&L1`>cu z-?r88JK-E8;gU5FDT;To+2~I;m||?IC(Jwm+8}8(@x+FG+5kDzt`q67x98wiiQ&89 YW&hLVmt^eY=KT(bW9)M{jvycY2|5@;v;Y7A diff --git a/tests/hipscat_import/data/small_sky_object_catalog/_metadata b/tests/hipscat_import/data/small_sky_object_catalog/_metadata index 63b03188f42a9fb1b4446d058e8ea678f6837297..26df207b47e12572da8ef6e66e909f2e2d10490f 100644 GIT binary patch delta 1320 zcma)5OKcle6diwJoVXFeX{@nogyZ1Bv`u2Wo)AZ>=$o;z$M}QY)ITA?c>Ei8Jf7Hb z97jzfgp@^AL{CDjxcikcuu%R0kEV@9f5DQqbs}K^1ci*^a5gSJG{k`+fJ@-EO zEOZw7tbTS6TMeWY~jqGZn->fuH0e6L_*982fl-UEjP%Q5xe zSng9VT0f@#vGq9k)9bz!tCuG*FYCw;hRxP+#p*e=dhUBpE!^L0+>yLY<>aXn-Afz;}ZVPx&E&_GXJ?F659xGMDUOE zS9~`D#|(Q3l}d%m&PeID`ZD`)c@w^)4t{0fZA%dsiQRs5z7VkE!IHxg!y7L4rDjDN zHxnKFm+LCFy}Yv;kt=*S-z%zF5vdKmysxCSb~CL_>bIpWNz6p>Q&+@sm!HVh4$dvI za|I=(#qifIFCi7zvZC;o)aSd6${n?!BDEUr8cI6ft5r*F!hE-G-ooA`7LOM%|F3qd zC*|;nwC*YWa<`d^Po>_V)0$>RawRjZRWm%~k}mKho;GeMRC=>to+?r8prL|n=`b(G zCfQz6ZQq^%w%h2Jx=JJ-tyeQgawR^d96f2A6Jtk`9!6pjA(TY|Lc-_-k)(){q|zaU z1RSA0nA8O9HW+e%khFVka}&~}2xugNp%-a@P>gt?EvK}J63us;@5BuvL~D_1`+&Sv z7+_*-lf(c-UC0sU-Ah5rx2p>VfC*?1-j*sUJa3BsbYEFb%2f%4FX&JXkmSlUx(?oQ zzvdx@8t5auH6WD06!DjC{|T_BN?I)v*AzW-pma!{=n)cgP}(%TJIiO23Um`xBoL8J zk)JTFm0#c>tDH$e=ni1S!7LZhA>?NDgszh2C;_gXD)&&nF>6R>dyj<5^UC0s#*>#sod7?Hl}gXuUUU(4YuQ?jX%_aHfwji{@gmT z9a~L6&CR{T@gp(4V|tE^p>I9^EIV@6Hun7Y$JCGq_HAZ=vp-?>L+%r1Uvt}w zf4n{I=DcBqdHJzSyHl=o+??lA&hx_V`O+@@JtO=wGtE0}D5mz)N3`+4 z$zb|qz(*F}`1(fVO6W?+zkSK)H%orht8_N>cFVuu^NVKA#-E0jm27@tL#5t1l+}4rjB`EyvOWVn4$>m3%mq9?HaU1>PaT@Eb+%T<^J=a@@ zxbM#dX0mFsF(>PNO>UNqQhuvm5X05={5?IZgMPOq8I>s6JMH~1KDm3AC*Pf#oNPqe zNQV#!RIfac zH{@taEEQ5iB_HX^wYDbL>fLm7t}nMjG}kZY!}Vez^RV1p8>nKVx30_Y$x(UmK@AX> ziU2?gALv0GU`gAN0HieJ01`Bi>0>I|rhp1naZI45symS3x(HELr-(;viL%M+?Vx8BOtSD%K?aM5`$~aU26%KZ0K{1!d~mk-i4%OF!^T$( zRg2vV_JTo{UNu4V13^y$XO#tt|VJ1qYSaW*DVvEEVSu2RR>}F&@ zquFI-(bl+8DrNF;(nqfeOR4H{AS8!~oTnMd+-S`@$75{cW|X-U1dzr@s+aOMIuMis{3iq!^t@DKanZXW;u diff --git a/tests/hipscat_import/data/small_sky_object_catalog/catalog_info.json b/tests/hipscat_import/data/small_sky_object_catalog/catalog_info.json index 59eaef9c..ef9a8b6d 100644 --- a/tests/hipscat_import/data/small_sky_object_catalog/catalog_info.json +++ b/tests/hipscat_import/data/small_sky_object_catalog/catalog_info.json @@ -5,4 +5,4 @@ "epoch": "J2000", "ra_column": "ra", "dec_column": "dec" -} \ No newline at end of file +} diff --git a/tests/hipscat_import/data/small_sky_object_catalog/point_map.fits b/tests/hipscat_import/data/small_sky_object_catalog/point_map.fits index e7287c9ffe13671c674f9d9a3f1d268f789dc544..1971966fe1e09405ab349380a805bcb939013341 100644 GIT binary patch delta 3949 zcmeHHD^NmF5cPuiWCnd$9yDz)DvTqT2nIilpGMS%Nu$X?BH|2z8G}Y6&b&Z`rp@S# z#fS|XQO`XG5G)qsCG2H)&z?PZFKln8*E@SWNX-{>+xg~E>M)Kox=US~L~bL$S&r70 z%O;-77n>LUZ6de5wY3o~rQf_LoqbF?<7VJ~2kvj+R)O1uZWFpq=r*C-gl-f5_a@B7 zjcL|`HtoBKMJD^?(PTvAz=Ha!3ag9BaiNX2 z5^T|(E(oCnsm=nAcd&qYm9|bB(q?G;1eH|MvK?zdL5;oS6dE|NREVw$-G@xX?Dc8^ z*sOC&Kt{7D(dmK)ay;qp+U>Kp$WTL}b$eZteU)AUp7CAOuvMhh0EaJaUn9X_0Afgj zP)?oB=thNc*XB)d&O~b}a>8m{7sMElRZ-VKkL9RKH;L*GMg~IpKmpv=F%o0@BX}j3(xL2u|kP7I2-wf`*APg5qk1Nm?)krZn1+&Lv{FexD8+TIXu!_u-4_K~;86 zY###<#jv1-#pRJW6$28`z<1cw)A0MJeV@rgYvdK>OPT{=MO~!zLy}gc!bn{`GHhEQ rqo)w^P_j!qpp`;YP>x;@5G6x~S2e>SiBLOH{`>Dee#t)<-+ufX4!4B% delta 86 zcmX?bAmM=X1}`2K1qB6*$$mV(%nBwJo9FS=GO~IDg%ma`@Od*bD<~LE&f}joaRLjY Z$!1=GTkMir3=kj-rD1%b&5dFQI03iw5&!@I diff --git a/tests/hipscat_import/data/small_sky_object_catalog/provenance_info.json b/tests/hipscat_import/data/small_sky_object_catalog/provenance_info.json index be110689..08ef05fe 100644 --- a/tests/hipscat_import/data/small_sky_object_catalog/provenance_info.json +++ b/tests/hipscat_import/data/small_sky_object_catalog/provenance_info.json @@ -1,43 +1,42 @@ { "catalog_name": "small_sky_object_catalog", "catalog_type": "object", - "version": "0.0.10.dev7+g0a79f90.d20230418", - "generation_date": "2023.04.20", - "epoch": "J2000", - "ra_kw": "ra", - "dec_kw": "dec", "total_rows": 131, + "epoch": "J2000", + "ra_column": "ra", + "dec_column": "dec", + "version": "0.2.1", + "generation_date": "2024.01.09", "tool_args": { "tool_name": "hipscat_import", - "version": "0.0.4.dev28+g2e31821.d20230420", + "version": "0.2.1", "runtime_args": { "catalog_name": "small_sky_object_catalog", - "output_path": "/home/data", + "output_path": "/home/delucchi/git/hipscat-import/tests/hipscat_import/data/", "output_artifact_name": "small_sky_object_catalog", "tmp_dir": "", "overwrite": true, - "dask_tmp": "/tmp/pytest-of-delucchi/pytest-1261/test_dask_runner0", + "dask_tmp": "", "dask_n_workers": 1, "dask_threads_per_worker": 1, - "catalog_path": "/home/data/small_sky_object_catalog", - "tmp_path": "/tmp/pytest-of-delucchi/pytest-1261/test_dask_runner0/small_sky_object_catalog/intermediate", + "catalog_path": "/home/delucchi/git/hipscat-import/tests/hipscat_import/data/small_sky_object_catalog", + "tmp_path": "/home/delucchi/git/hipscat-import/tests/hipscat_import/data/small_sky_object_catalog/intermediate", "epoch": "J2000", "catalog_type": "object", - "input_path": "/home/data/small_sky_parts", + "input_path": "/home/delucchi/git/hipscat-import/tests/hipscat_import/data/small_sky", "input_paths": [ - "/home/data/small_sky_parts/catalog_00_of_05.csv", - "/home/data/small_sky_parts/catalog_01_of_05.csv", - "/home/data/small_sky_parts/catalog_02_of_05.csv", - "/home/data/small_sky_parts/catalog_03_of_05.csv", - "/home/data/small_sky_parts/catalog_04_of_05.csv" + "file:///home/delucchi/git/hipscat-import/tests/hipscat_import/data/small_sky/catalog.csv" ], "input_format": "csv", "input_file_list": [], "ra_column": "ra", "dec_column": "dec", - "sort_columns": "id", - "highest_healpix_order": 1, + "use_hipscat_index": false, + "sort_columns": null, + "constant_healpix_order": -1, + "highest_healpix_order": 7, "pixel_threshold": 1000000, + "mapping_healpix_order": 7, "debug_stats_only": false, "file_reader_info": { "input_reader_type": "CsvReader", @@ -50,4 +49,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/hipscat_import/index/test_run_index.py b/tests/hipscat_import/index/test_run_index.py index 80190d10..6ebace01 100644 --- a/tests/hipscat_import/index/test_run_index.py +++ b/tests/hipscat_import/index/test_run_index.py @@ -2,6 +2,7 @@ import os +import numpy as np import pyarrow as pa import pyarrow.parquet as pq import pytest @@ -108,3 +109,55 @@ def test_run_index_on_source( schema = pq.read_metadata(os.path.join(args.catalog_path, "_common_metadata")).schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) + + +@pytest.mark.dask +def test_run_index_on_source_object_id( + small_sky_source_catalog, + dask_client, # pylint: disable=unused-argument + tmp_path, + assert_parquet_file_index, +): + """Test appropriate metadata is written.""" + + args = IndexArguments( + input_catalog_path=small_sky_source_catalog, + indexing_column="object_id", + output_path=tmp_path, + output_artifact_name="small_sky_source_object_id_index", + overwrite=True, + include_hipscat_index=False, + progress_bar=False, + ) + runner.run(args) + + # Check that the catalog metadata file exists + catalog = Dataset.read_from_hipscat(args.catalog_path) + assert catalog.on_disk + assert catalog.catalog_path == args.catalog_path + + basic_index_parquet_schema = pa.schema( + [ + pa.field("Norder", pa.uint8()), + pa.field("Dir", pa.uint64()), + pa.field("Npix", pa.uint64()), + pa.field("object_id", pa.int64()), + ] + ) + + outfile = os.path.join(args.catalog_path, "index", "part.0.parquet") + schema = pq.read_metadata(outfile).schema.to_arrow_schema() + assert schema.equals(basic_index_parquet_schema, check_metadata=False) + + id_range = np.arange(700, 831) + ## Some of the objects have sources that span two source partitions. + doubled_up = [706, 707, 716, 726, 730, 736, 740, 779, 780, 784, 787, 789, 790, 792, 797, 818, 820] + doubled_up.extend(id_range) + + assert_parquet_file_index(outfile, doubled_up) + + schema = pq.read_metadata(os.path.join(args.catalog_path, "_metadata")).schema.to_arrow_schema() + assert schema.equals(basic_index_parquet_schema, check_metadata=False) + + schema = pq.read_metadata(os.path.join(args.catalog_path, "_common_metadata")).schema.to_arrow_schema() + assert schema.equals(basic_index_parquet_schema, check_metadata=False)