diff --git a/working_dirs/kyle/results_analysis/ppl_dup_figure.ipynb b/working_dirs/kyle/results_analysis/ppl_dup_figure.ipynb index d0539a2..a383244 100644 --- a/working_dirs/kyle/results_analysis/ppl_dup_figure.ipynb +++ b/working_dirs/kyle/results_analysis/ppl_dup_figure.ipynb @@ -704,7 +704,7 @@ "name": "stderr", "output_type": "stream", "text": [ - " 72%|███████▏ | 722/1000 [1:16:32<30:45, 6.64s/it]" + " 89%|████████▉ | 892/1000 [1:34:35<11:02, 6.13s/it]" ] } ], diff --git a/working_dirs/kyle/results_analysis/results_analysis.ipynb b/working_dirs/kyle/results_analysis/results_analysis.ipynb index 9b04056..308cac0 100644 --- a/working_dirs/kyle/results_analysis/results_analysis.ipynb +++ b/working_dirs/kyle/results_analysis/results_analysis.ipynb @@ -2541,22 +2541,36 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Plotting Features: 0%| | 0/14 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ "hist_splits = [\"_deduped_12b\"]\n", "features = [\n", - " \"sequence_duplicates\", \"max_frequency\", \"avg_frequency\", \"min_frequency\",\n", - " \"median_frequency\", \"p25_frequency\", \"p75_frequency\",\n", + " \"sequence_duplicates\",\n", + " \"max_frequency\",\n", + " \"avg_frequency\",\n", + " \"min_frequency\",\n", + " \"median_frequency\",\n", + " \"p25_frequency\", \"p75_frequency\",\n", " \"0_8_templates\", \"huffman_coding_length\", \"prompt_perplexity\", \"generation_perplexity\", \"sequence_perplexity\", \"0_8_snowclones\", \"loss\"\n", "]\n", "bins_per_feature = {\n", @@ -2583,13 +2597,13 @@ " \"median_frequency\": 10**6,\n", " \"p25_frequency\": 10**5,\n", " \"p75_frequency\": 10**6,\n", - " \"0_8_templates\": 10**-0.6,\n", " \"prompt_perplexity\": 10**0,\n", " \"generation_perplexity\": 10**0,\n", " \"sequence_perplexity\": 10**0, \n", + " \"loss\": 10**-0.5,\n", " \"huffman_coding_length\": 2, \n", + " \"0_8_templates\": 10**-0.6,\n", " \"0_8_snowclones\": 10**0, \n", - " \"loss\": 10**-0.5\n", "}\n", "max_threshold = {\n", " \"sequence_duplicates\": 10**7,\n", @@ -2609,19 +2623,19 @@ "}\n", "name_map = {\n", " \"sequence_duplicates\": \"Duplicates\",\n", + " \"0_8_templates\": \"Textual Duplicates\",\n", + " \"0_8_snowclones\": \"Semantic Duplicates\",\n", + " \"prompt_perplexity\": \"Prompt PPL\",\n", + " \"generation_perplexity\": \"Generation PPL\",\n", + " \"sequence_perplexity\": \"Sequence PPL\",\n", + " \"loss\": \"Loss\",\n", " \"max_frequency\": \"Max Token Freq.\",\n", " \"avg_frequency\": \"Mean Token Freq.\",\n", " \"min_frequency\": \"Min Token Freq.\",\n", " \"median_frequency\": \"Median Token Freq.\",\n", " \"p25_frequency\": \"P25 Token Freq.\",\n", " \"p75_frequency\": \"P75 Token Freq.\",\n", - " \"0_8_templates\": \"0.8 Templates\",\n", - " \"prompt_perplexity\": \"Prompt PPL\",\n", - " \"generation_perplexity\": \"Generation PPL\",\n", - " \"sequence_perplexity\": \"Sequence PPL\",\n", " \"huffman_coding_length\": \"Huffman Length\",\n", - " \"0_8_snowclones\": \"0.8 Snowclones\",\n", - " \"loss\": \"Loss\"\n", "}\n", "e = 1e-10\n", "num_rows = 2\n", @@ -2630,7 +2644,7 @@ "fig, axs = plt.subplots(num_rows, num_columns, figsize=(17, 5))\n", "axs = axs.flatten()\n", "for i, split in enumerate(hist_splits):\n", - " for j, fx in tqdm(enumerate(features), desc=\"Plotting Features\", total=len(features)):\n", + " for j, fx in tqdm(enumerate(name_map.keys()), desc=\"Plotting Features\", total=len(features)):\n", " memories = hists_plotting_frame[hists_plotting_frame[\"Memorized\"] == True][fx]\n", " memories = [value for value in memories if value >= 0]\n", " df_memories = pd.DataFrame(memories, columns=[fx])\n", @@ -2647,8 +2661,8 @@ " bins = bins_all \n", "\n", " # no whitespace between histograms for continuous features. Make width a bit larger\n", - " sns.histplot(data=df_pile[fx], bins=bins, label=\"Pile\", ax=axs[i * num_columns + j], stat=\"percent\", binwidth=0.1)\n", - " sns.histplot(data=df_memories[fx], bins=bins, label=\"Memorized\", ax=axs[i * num_columns + j], stat=\"percent\", binwidth=0.1)\n", + " sns.histplot(data=df_pile[fx], bins=bins, label=\"Pile\", ax=axs[i * num_columns + j], stat=\"percent\", element=\"step\")\n", + " sns.histplot(data=df_memories[fx], bins=bins, label=\"Memorized\", ax=axs[i * num_columns + j], stat=\"percent\", element=\"step\")\n", "\n", " if fx == \"huffman_coding_length\":\n", " axs[i * num_columns + j].set_xscale(\"linear\") \n", @@ -2678,6 +2692,7 @@ " axs[i * num_columns + j].set_ylabel(\"\")\n", "\n", "fig.legend(labels=[\"Not Memorized\", \"Memorized\"], loc=\"upper center\", bbox_to_anchor=(0.5, -0.005), ncol=2, fontsize=18, frameon=False)\n", + "fig.align_xlabels()\n", "plt.tight_layout()\n", "fig.savefig(f\"{figures_path}/histograms_percents.pdf\", bbox_inches=\"tight\")\n", "plt.show()" diff --git a/working_dirs/kyle/results_analysis/scale+time_figures/recitation_threshold_5/histograms_percents.pdf b/working_dirs/kyle/results_analysis/scale+time_figures/recitation_threshold_5/histograms_percents.pdf index 07fdcd2..d7d9550 100644 Binary files a/working_dirs/kyle/results_analysis/scale+time_figures/recitation_threshold_5/histograms_percents.pdf and b/working_dirs/kyle/results_analysis/scale+time_figures/recitation_threshold_5/histograms_percents.pdf differ