Speed up emission plot from packets
lukeshingles committed Feb 14, 2024
1 parent cc85001 commit 1c41846
Showing 4 changed files with 103 additions and 26 deletions.
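
The main change: instead of labelling emission types by calling Python lambdas through map_elements for every unique value, the ion-level labels are now built as small polars frames and joined onto the packet LazyFrame. A minimal sketch of the pattern (toy values, not the artistools API):

import polars as pl

packets = pl.LazyFrame({"emissiontype": [3, 0, 3, -9999999]})
# one label row per emission-type code, built once up front
labels = pl.LazyFrame(
    {"emissiontype": [0, 3, -9999999], "emissiontype_str": ["Fe II bound-bound", "Co II bound-bound", "free-free"]}
)
# a single left join replaces a per-value Python callback
packets = packets.join(labels, on="emissiontype", how="left")
print(packets.collect())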
2 changes: 2 additions & 0 deletions artistools/__init__.py
@@ -55,13 +55,15 @@
from artistools.misc import get_deposition
from artistools.misc import get_dirbin_labels
from artistools.misc import get_elsymbol
from artistools.misc import get_elsymbols_df
from artistools.misc import get_elsymbolslist
from artistools.misc import get_escaped_arrivalrange
from artistools.misc import get_file_metadata
from artistools.misc import get_filterfunc
from artistools.misc import get_grid_mapping
from artistools.misc import get_inputparams
from artistools.misc import get_ion_tuple
from artistools.misc import get_ionstage_roman_numeral_df
from artistools.misc import get_ionstring
from artistools.misc import get_linelist_dataframe
from artistools.misc import get_linelist_pldf
75 changes: 65 additions & 10 deletions artistools/misc.py
@@ -606,6 +606,20 @@ def get_elsymbolslist() -> list[str]:
]


def get_elsymbols_df() -> pl.DataFrame:
"""Return a polars DataFrame of atomic number and element symbols."""
return (
pl.read_csv(
at.get_config()["path_datadir"] / "elements.csv",
separator=",",
has_header=True,
dtypes={"Z": pl.Int32},
)
.drop("name")
.rename({"symbol": "elsymbol", "Z": "atomic_number"})
)
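
A hedged usage sketch for the new helper (assumes elements.csv is present in the configured path_datadir; the Int32 schema matches the dtype of the "Z" column above so the join keys agree):

import polars as pl
import artistools as at

df = pl.DataFrame({"atomic_number": [26, 27, 28]}, schema={"atomic_number": pl.Int32})
df = df.join(at.get_elsymbols_df(), on="atomic_number", how="left")
# adds an "elsymbol" column: Fe, Co, Ni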


def get_atomic_number(elsymbol: str) -> int:
"""Return the atomic number of an element symbol."""
assert elsymbol is not None
@@ -625,6 +639,17 @@ def decode_roman_numeral(strin: str) -> int:
return -1


def get_ionstage_roman_numeral_df() -> pl.DataFrame:
"""Return a polars DataFrame of ionisation stage and roman numerals."""
return pl.DataFrame(
{
"ionstage": list(range(1, len(roman_numerals))),
"ionstage_roman": roman_numerals[1:],
},
schema={"ionstage": pl.Int32, "ionstage_roman": pl.Utf8},
)
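
A hedged usage sketch (the Int32 schema matches the dtype defined in the schema above so the join keys agree):

import polars as pl
import artistools as at

df = pl.DataFrame({"ionstage": [1, 2, 3]}, schema={"ionstage": pl.Int32})
df = df.join(at.get_ionstage_roman_numeral_df(), on="ionstage", how="left")
# adds an "ionstage_roman" column: I, II, III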


def get_elsymbol(atomic_number: int | np.int64) -> str:
"""Return the element symbol of an atomic number."""
return get_elsymbolslist()[atomic_number]
@@ -946,7 +971,7 @@ def merge_pdf_files(pdf_files: list[str]) -> None:
print(f"Files merged and saved to {resultfilename}.pdf")


def get_bflist(modelpath: Path | str) -> pl.DataFrame:
def get_bflist(modelpath: Path | str, get_ion_str: bool = False) -> pl.LazyFrame:
"""Return a dict of bound-free transitions from bflist.out."""
compositiondata = get_composition_data(modelpath)
bflistpath = firstexisting(["bflist.out", "bflist.dat"], folder=modelpath, tryzipped=True)
@@ -955,24 +980,37 @@ def get_bflist(modelpath: Path | str) -> pl.DataFrame:
dfboundfree = pl.read_csv(
bflistpath,
skip_rows=1,
has_header=False,
separator=" ",
new_columns=["i", "elementindex", "ionindex", "lowerlevel", "upperionlevel"],
new_columns=["bfindex", "elementindex", "ionindex", "lowerlevel", "upperionlevel"],
dtypes={
"i": pl.Int32,
"bfindex": pl.Int32,
"elementindex": pl.Int32,
"ionindex": pl.Int32,
"lowerlevel": pl.Int32,
"upperionlevel": pl.Int32,
},
)
).lazy()

dfboundfree = dfboundfree.with_columns(
atomic_number=pl.col("elementindex").map_elements(lambda elementindex: compositiondata["Z"][elementindex]),
ionstage=pl.col("ionindex")
+ pl.col("elementindex").map_elements(lambda elementindex: compositiondata["lowermost_ionstage"][elementindex]),
)
return dfboundfree.drop(["elementindex", "ionindex"]).select(
["atomic_number", "ionstage", "lowerlevel", "upperionlevel"]
)

dfboundfree = dfboundfree.drop(["elementindex", "ionindex"])

if get_ion_str:
dfgroupstrings = dfboundfree.group_by(["atomic_number", "ionstage"]).agg(
pl.map_groups(
exprs=[pl.col("atomic_number").first(), pl.col("ionstage").first()],
function=lambda z_ionstage: f"{at.get_ionstring(z_ionstage[0].item(), z_ionstage[1].item())} bound-free",
).alias("ion_str"),
)

dfboundfree = dfboundfree.join(dfgroupstrings, how="left", on=["atomic_number", "ionstage"])

return dfboundfree
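
A hedged usage sketch of the reworked function ("mymodel" is a hypothetical model directory):

import artistools as at

bfl = at.get_bflist("mymodel", get_ion_str=True).collect()
# columns: bfindex, lowerlevel, upperionlevel, atomic_number, ionstage, ion_str
# with ion_str values such as "Fe II bound-free"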


linetuple = namedtuple("linetuple", "lambda_angstroms atomic_number ionstage upperlevelindex lowerlevelindex")
@@ -1002,7 +1040,7 @@ def read_linestatfile(filepath: Path | str) -> tuple[int, list[float], list[int]
return nlines, lambda_angstroms, atomic_numbers, ionstages, upper_levels, lower_levels


def get_linelist_pldf(modelpath: Path | str) -> pl.LazyFrame:
def get_linelist_pldf(modelpath: Path | str, get_ion_str: bool = False) -> pl.LazyFrame:
textfile = at.firstexisting("linestat.out", folder=modelpath)
parquetfile = Path(modelpath, "linelist.out.parquet")
if not parquetfile.is_file() or parquetfile.stat().st_mtime < textfile.stat().st_mtime:
@@ -1018,20 +1056,37 @@ def get_linelist_pldf(modelpath: Path | str) -> pl.LazyFrame:
"lower_level": lower_levels,
},
)
# .with_columns(pl.col(pl.Int64).cast(pl.Int32))
.with_columns(pl.col(pl.Int64).cast(pl.Int32))
.with_row_count(name="lineindex")
)
pldf.write_parquet(parquetfile, compression="zstd")
print(f"Saved {parquetfile}")
else:
print(f"Reading {parquetfile}")

return (
linelist_lazy = (
pl.scan_parquet(parquetfile)
.with_columns(upperlevelindex=pl.col("upper_level") - 1, lowerlevelindex=pl.col("lower_level") - 1)
.drop(["upper_level", "lower_level"])
.with_columns(pl.col(pl.Int64).cast(pl.Int32))
)

if get_ion_str:
linelist_lazy = (
linelist_lazy.group_by(["atomic_number", "ionstage"])
.agg(
pl.struct(pl.exclude(["atomic_number", "ionstage"])).alias("othercols"),
pl.map_groups(
exprs=[pl.col("atomic_number").first(), pl.col("ionstage").first()],
function=lambda z_ionstage: f"{at.get_ionstring(z_ionstage[0].item(), z_ionstage[1].item())} bound-bound",
).alias("ion_str"),
)
.explode("othercols")
.unnest("othercols")
)

return linelist_lazy
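
A hedged usage sketch of the new get_ion_str option ("mymodel" is a hypothetical model directory):

import artistools as at

linelist = at.get_linelist_pldf("mymodel", get_ion_str=True)
# lazily attach per-line ion labels, then collect only the columns needed
labels = linelist.select(["lineindex", "lambda_angstroms", "ion_str"]).collect()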


@lru_cache(maxsize=8)
def get_linelist_dataframe(
2 changes: 1 addition & 1 deletion artistools/packets/packets.py
@@ -13,7 +13,7 @@
import artistools as at

# for the parquet files
time_parquetschemachange = (2023, 11, 19, 12, 0, 0)
time_parquetschemachange = (2024, 2, 14, 11, 0, 0)

CLIGHT = 2.99792458e10
DAY = 86400
50 changes: 35 additions & 15 deletions artistools/spectra/spectra.py
@@ -814,16 +814,22 @@ def get_flux_contributions_from_packets(
if groupby in {"terms", "upperterm"}:
adata = at.atomic.get_levels(modelpath)

linelist = at.get_linelist_pldf(modelpath=modelpath).collect()
bflist = at.get_bflist(modelpath)
emtypecolumn = "emissiontype" if use_lastemissiontype else "trueemissiontype"

linelistlazy = at.get_linelist_pldf(modelpath=modelpath, get_ion_str=True)
bflistlazy = at.get_bflist(modelpath, get_ion_str=True)

if groupby != "ion":
linelist = linelistlazy.collect()
bflist = bflistlazy.collect()

def get_emprocesslabel(emtype: int) -> str:
if emtype >= 0:
line = linelist.row(emtype, named=True)

if groupby == "line":
return (
f"{at.get_ionstring(line['atomic_number'], line['ionstage'])} "
f"{line['ion_str']} "
f"λ{line['lambda_angstroms']:.0f} "
f"({line['upperlevelindex']}-{line['lowerlevelindex']})"
)
@@ -834,15 +840,15 @@ def get_emprocesslabel(emtype: int) -> str:
upper_term_noj = upper_config.split("_")[-1].split("[")[0]
lower_config = ion.levels.iloc[line["lowerlevelindex"]].levelname
lower_term_noj = lower_config.split("_")[-1].split("[")[0]
return f"{at.get_ionstring(line['atomic_number'], line['ionstage'])} {upper_term_noj}->{lower_term_noj}"
return f"{line['ion_str']} {upper_term_noj}->{lower_term_noj}"

if groupby == "upperterm":
ion = adata[(adata["Z"] == line["atomic_number"]) & (adata["ionstage"] == line["ionstage"])].iloc[0]
upper_config = ion.levels.iloc[line["upperlevelindex"]].levelname
upper_term_noj = upper_config.split("_")[-1].split("[")[0]
return f"{at.get_ionstring(line['atomic_number'], line['ionstage'])} {upper_term_noj}"
return f"{line['ion_str']} {upper_term_noj}"

return f"{at.get_ionstring(line['atomic_number'], line['ionstage'])} bound-bound"
return line["ion_str"]

if emtype == -9999999:
return "free-free"
@@ -869,11 +875,10 @@ def get_absprocesslabel(abstype: int) -> str:
return "free-free"
return "bound-free" if abstype == -2 else "? other absorp."

emtypecolumn = "emissiontype" if use_lastemissiontype else "trueemissiontype"

nprocs_read, lzdfpackets = at.packets.get_packets_pl(
modelpath, maxpacketfiles=maxpacketfiles, packet_type="TYPE_ESCAPE", escape_type="TYPE_RPKT"
)

if emissionvelocitycut is not None:
lzdfpackets = at.packets.add_derived_columns_lazy(lzdfpackets)
lzdfpackets = lzdfpackets.filter(pl.col("emission_velocity") > emissionvelocitycut)
@@ -885,13 +890,28 @@ def get_absprocesslabel(abstype: int) -> str:

if getemission:
cols |= {"emissiontype_str", "nu_rf"}

emtypes = lzdfpackets.select(emtypecolumn).collect().get_column(emtypecolumn).unique().sort()
lzdfpackets = lzdfpackets.join(
pl.DataFrame({emtypecolumn: emtypes, "emissiontype_str": emtypes.map_elements(get_emprocesslabel)}).lazy(),
on=emtypecolumn,
how="left",
)
if groupby == "ion":
emtypestrings = pl.concat(
[
linelistlazy.select([pl.col("lineindex").cast(pl.Int32).alias(emtypecolumn), pl.col("ion_str")]),
pl.DataFrame(
{emtypecolumn: [-9999999], "ion_str": "free-free"},
schema_overrides={emtypecolumn: pl.Int32, "ion_str": pl.String},
).lazy(),
bflistlazy.select([(-1 - pl.col("bfindex").cast(pl.Int32)).alias(emtypecolumn), pl.col("ion_str")]),
],
).rename({"ion_str": "emissiontype_str"})

lzdfpackets = lzdfpackets.join(emtypestrings, on=emtypecolumn, how="left")
else:
emtypes = lzdfpackets.select(emtypecolumn).collect().get_column(emtypecolumn).unique().sort()
lzdfpackets = lzdfpackets.join(
pl.DataFrame(
{emtypecolumn: emtypes, "emissiontype_str": emtypes.map_elements(get_emprocesslabel)}
).lazy(),
on=emtypecolumn,
how="left",
)
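
For groupby == "ion", the emtypestrings frame above covers every emission-type code without collecting the packet data or calling a Python function per value. A small sketch of the code encoding it relies on (illustration only, not the artistools API):

def decode_emtype(emtype: int) -> str:
    # non-negative codes are bound-bound line indices
    if emtype >= 0:
        return f"bound-bound line {emtype}"
    # the sentinel -9999999 marks free-free emission
    if emtype == -9999999:
        return "free-free"
    # remaining negative codes map to bound-free entries via bfindex = -1 - emtype
    return f"bound-free index {-1 - emtype}"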

if getabsorption:
cols |= {"absorptiontype_str", "absorption_freq"}
