diff --git a/.taskfiles/hugo/taskfile.yaml b/.taskfiles/hugo/taskfile.yaml index df32542..1d7063f 100644 --- a/.taskfiles/hugo/taskfile.yaml +++ b/.taskfiles/hugo/taskfile.yaml @@ -33,7 +33,7 @@ tasks: sed -i '/draft: true/c\draft: false' {{ .path }} - | sed -i '/date: .*/c\date: {{ now.Format "2006-01-02" }}' {{ .path }} - - echo "Branch is ready for PR" + - echo "Commit this change, then branch is ready for PR!" # - task: _pull_request requires: vars: ["path"] diff --git a/archetypes/blog.md b/archetypes/blog.md index 769bcda..b054c36 100644 --- a/archetypes/blog.md +++ b/archetypes/blog.md @@ -1,6 +1,10 @@ --- title: {{ replace .Name "-" " " | title }} date: {{ .Date }} +authors: + - name: ahgraber + link: https://github.com/ahgraber + image: https://github.com/ahgraber.png tags: # meta - 'meta' @@ -19,5 +23,8 @@ tags: - 'copyright' - 'privacy' series: [] +layout: single +toc: true +math: false draft: true --- diff --git a/config/_default/params.yaml b/config/_default/params.yaml index 51e9c01..1584f08 100644 --- a/config/_default/params.yaml +++ b/config/_default/params.yaml @@ -41,7 +41,7 @@ navbar: width: 50 height: 50 footer: - width: full # *width + width: *width displayCopyright: true displayPoweredBy: true diff --git a/content/blog/hello-world/index.md b/content/blog/hello-world/index.md index 7cb9780..1ba6428 100644 --- a/content/blog/hello-world/index.md +++ b/content/blog/hello-world/index.md @@ -1,10 +1,17 @@ --- title: Hello World! date: 2024-04-22 +authors: + - name: ahgraber + link: https://github.com/ahgraber + image: https://github.com/ahgraber.png tags: - 'meta' - 'blogumentation' - 'homelab' +layout: single +toc: true +math: false draft: false --- diff --git a/content/blog/llama3-cohort.md/architectures.csv b/content/blog/llama3-cohort.md/architectures.csv new file mode 100644 index 0000000..e72d4a9 --- /dev/null +++ b/content/blog/llama3-cohort.md/architectures.csv @@ -0,0 +1,21 @@ +,Meta,,,Google,Cohere,Databricks,Mistral,Meta,,Microsoft,,,Snowflake,DeepSeek +Release Date,18-Jul-23,,,21-Feb-24,11-Mar-24,27-Mar-24,17-Apr-24,18-Apr-24,,22-Apr-24,,,24-Apr-24,7-May-24 +Name,llama-2-7B,llama-2-13B,llama-2-70B,Gemma 7B,Command-R,DBRX,8x22B,llama-3-8B,llama-3-70B,Phi 3 mini,Phi 3 small,Phi 3 medium,Arctic,v2 +Training Tokens,2T,2T,2T,6T,_?_,12T,_?_,15T,80T,3.3T,4.8T,4.8T,3.5T,8.1T +Tokenizer Vocabulary,32k,32k,32k,256k,256k,100k,32k,128k,128k,32k,100k,32k (?),32k,100k +Context Length (training),4k,4k,4k,8k,8k,32k,4k,8k,8k,4k,4k,,4k,4k +Hidden dimension,4096,5120,8192,3072,8192,6144,6144,4096,8192,3072,4096,5120,7168,5120 +FF dimension,11008,13824,28672,24576,,10752,16384,14336,28672,8192,_?_,_?_,4864,1536 +Positional Encoding,RoPE,RoPE,RoPE,RoPE,RoPE?,RoPE,RoPE,RoPE,RoPE,RoPE / LongRoPE,RoPE?,RoPE?,RoPE,RoPE +Normalization,RMSNorm,RMSNorm,RMSNorm,RMSNorm,_?_,Layer,RMSNorm,RMSNorm,RMSNorm,RMSNorm,_?_,_?_,RMSNorm,RMSNorm +Activation Function,SwiGLU,SwiGLU,SwiGLU,GeGLU,SiLU,GLU,SiLU,SwiGLU,SwiGLU,SiLU,_?_,_?_,SwiGLU,SwiGLU +Attention,_?_,_?_,GQA,MQA,_?_,GQA,"SWA, GQA",GQA,GQA,SWA,GQA; BlockSparse,_?_,Attention-sinks SWA (TBD),MLA +Heads,32,40,64,16,64,48,48,32,64,32,32,40,56,128 +Layers,32,40,80,28,40,40,56,32,80,32,32,40,35,60 +Alignment,"SFT, PPO","SFT, PPO","SFT, Rejection Sampling, PPO","SFT, RLHF",_?_,"SFT, _RLHF (implied)_","? 
SFT, DPO","SFT, Rejection Sampling, PPO, DPO","SFT, Rejection Sampling, PPO, DPO","SFT, DPO",_?_,_?_,SFT,"SFT, GRPO" +MoE,no,no,no,no,no,yes,yes,no,no,no,no,no,hybrid,yes +Experts,,,,,,16,8,,,,,,128,160+2 +Top-k,,,,,,4,2,,,,,,2,6 +Total Params,,,,,,132B,141B,,,,,,480B,236B +**Parameters (active)**,**7B**,**13B**,**70B**,**7B**,**35B**,**36B**,**39B**,**8B**,**70B**,**3.8B**,**7B**,**14B**,**17B**,**21B** +Context Length (inference),4k,4k,4k,8k,128k,32k,64k,8k,8k,4k; 128k,8k,_?_,4k; 32k with SWA,128k diff --git a/content/blog/llama3-cohort.md/benchmarks.csv b/content/blog/llama3-cohort.md/benchmarks.csv new file mode 100644 index 0000000..b0fde3e --- /dev/null +++ b/content/blog/llama3-cohort.md/benchmarks.csv @@ -0,0 +1,21 @@ +,Release Date,Name,MMLU (language),modifier,GSM8K (math),modifier,HumanEval (code),modifier +Meta,18-Jul-23,llama-2-7B,34.1,5-shot,25.7,8-shot CoT,7.9,0-shot +,,llama-2-13B,47.8,5-shot,77.4,8-shot CoT,14,0-shot +,,llama-2-70B,52.9,5-shot,57.5,8-shot CoT,25.6,0-shot +Google,21-Feb-24,Gemma 7B,64.3,5-shot,46.4,maj@1,32.3,0-shot +Cohere,11-Mar-24,Command-R,59.3,5-shot,,,, +Databricks,27-Mar-24,DBRX,73.7,5-shot,72.8,8-shot CoT,70.1,0-shot +Mistral,17-Apr-24,8x22B,77.7,5-shot,90.8,8-shot CoT,45.1,0-shot +Meta,18-Apr-24,llama-3-8B,68.4,5-shot,79.6,8-shot CoT,62.2,0-shot +,,llama-3-70B,82,5-shot,93,8-shot CoT,71.7,0-shot +Microsoft,22-Apr-24,Phi 3 mini,68.8,5-shot,82.5,0-shot CoT,59.1,0-shot +,,Phi 3 small,75.3,5-shot,88.9,0-shot CoT,59.1,0-shot +,,Phi 3 medium,78.2,5-shot,90.3,0-shot CoT,55.5,0-shot +Snowflake,24-Apr-24,Arctic,67.3,5-shot,74.2,?,64.3,? +DeepSeek,7-May-24,v2,78.5,5-shot,79.2,0-shot CoT,48.8,0-shot +,,,,,,,, +Anthropic,4-Mar-24,Claude 3 Haiku,75.2,5-shot,88.9,0-shot,75.9,0-shot +,,Claude 3 Sonnet,79,5-shot,92.3,0-shot,73,0-shot +,,Claude 3 Opus,86.8,5-shot,95,0-shot,84.9,0-shot +OpenAI,14-Mar-23,GPT 3.5-turbo,70,5-shot,57.1,5-shot CoT,48.1,0-shot +,14-Mar-23,GPT 4,86.4,5-shot,92,5-shot CoT,67,0-shot diff --git a/content/blog/llama3-cohort.md/environmental_impact.csv b/content/blog/llama3-cohort.md/environmental_impact.csv new file mode 100644 index 0000000..41a1d72 --- /dev/null +++ b/content/blog/llama3-cohort.md/environmental_impact.csv @@ -0,0 +1,15 @@ +,Release Date,Name,GPUs,GPU Hours,Power Consumption (W),tCO2eq,FLOPs*,assumed utilization* +Meta,18-Jul-23,llama-2-7B,A100-80GB,"184,320",400,31.22,1.60E+22,30% +,,llama-2-13B,A100-80GB,"368,640",400,62.44,3.10E+22,30% +,,llama-2-70B,A100-80GB,"1,720,320",400,291.42,1.40E+23,30% +Google,21-Feb-24,Gemma 7B,4096 TPUv5e,,,"~131, incl. 
2B models",,
+Cohere,11-Mar-24,Command-R,,,,,,
+Databricks,27-Mar-24,DBRX,3072 H100s,,,,,
+Mistral,17-Apr-24,8x22B,,,,,,
+Meta,18-Apr-24,llama-3-8B,16k H100s,1.3M,700,390,2.40E+23,40%
+,,llama-3-70B,16k H100s,6.4M,700,1900,1.20E+24,40%
+Microsoft,22-Apr-24,Phi 3 mini,,,,,,
+,,Phi 3 small,,,,,,
+,,Phi 3 medium,,,,,,
+Snowflake,24-Apr-24,Arctic,H100s,"~504,000",700,,7.10E+22,30%
+DeepSeek,7-May-24,v2,H800s,"~172,800",700,,,
diff --git a/content/blog/llama3-cohort.md/index.md b/content/blog/llama3-cohort.md/index.md
new file mode 100644
index 0000000..d3465cb
--- /dev/null
+++ b/content/blog/llama3-cohort.md/index.md
@@ -0,0 +1,240 @@
+---
+title: Cohort of Models
+date: 2024-05-11
+authors:
+  - name: ahgraber
+    link: https://github.com/ahgraber
+    image: https://github.com/ahgraber.png
+tags: ["LLMs", "generative ai", "comparison"]
+series: []
+layout: wide
+toc: false
+math: false
+draft: false
+---
+The "open weights" or "open model" LLM ecosystem is thriving,
+with major new releases from Meta, Microsoft, Databricks, and Snowflake in the past two months.
+Given that Meta's Llama 2 family became the standard point of comparison for all other models,
+I thought it would be useful to aggregate all of the information about the 'Llama 3 cohort' in a single place.
+I've included Llama 2 as a point of comparison. Models are ordered by date of introduction.
+
+{{< callout type="question" emoji="📣" >}}
+  I've done my best to make these tables not-terrible,
+  but they're probably still trash on mobile or non-wide aspect ratios.
+  Sorry.
+{{< /callout >}}
+
+## Architectures
+
+{{< table path="architectures.csv" header="true" caption="A comparison of model architectures" >}}
+
+{{< callout type="warning" >}}
+  I've not included ChatGPT 3.5 or 4, or Anthropic's Claude because they're completely closed models
+  and I was unable to find any information worth comparing.
+{{< /callout >}}
+
+## Performance
+
+{{< callout type="question" emoji="📣" >}}
+
+  1. I've transposed table rows/columns from here on (they fit better that way).
+  2. I've included benchmarks from OpenAI's GPT-4 paper and Anthropic's Claude 3 announcement as points of comparison.
+{{< /callout >}}
+
+{{< table path="benchmarks.csv" header="true" caption="A comparison of benchmark performance" >}}
+
+## Environmental impact
+
+{{< table path="environmental_impact.csv" header="true" caption="Environmental impacts of training (but not inference)" >}}
+
+\* FLOPs and % utilization are estimated using Epoch AI's online tool.[^compute]
+
+## Observations
+
+In doing the research and aggregation for these tables,
+I read, and reread, and pored over the papers describing the technical details of these models.
+
+The TLDR? All changes have been made with inference efficiency in mind.
+
+### Data matters
+
+Almost every paper makes a statement about how they cleaned, filtered, and optimized their data;
+several (Llama 3, Phi 3) use LLMs in this process.
+Some (the Phi 3 models in particular) also use LLMs to generate "high quality synthetic" data.
+Databricks notes in their DBRX announcement that the dataset used to train DBRX is 2x as good as
+the dataset they used in 2023; that is, "half as many tokens are necessary to reach the same model quality."[^dbrx]
+Concretely, curation of training data removes the low-information sequences that require compute to process
+but minimally improve model performance.
+
+Additionally, models are trained on more tokens,
+moving from the compute-optimal point toward the data-optimal frontier of the Chinchilla laws.
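+
+To put rough numbers on that shift, here's a back-of-the-envelope comparison using the common `C ≈ 6 * N * D`
+approximation for training compute (N = parameters, D = tokens). The configurations below are illustrative
+sketches, not figures pulled from any of these papers:
+
+```python
+# Approximate training compute: C ≈ 6 * N * D (N = parameters, D = training tokens).
+# Illustrative configurations only -- not taken from any model card.
+
+def train_flops(params: float, tokens: float) -> float:
+    """Rough training compute in FLOPs."""
+    return 6 * params * tokens
+
+# A Chinchilla-style "compute optimal" run: larger model, ~20 tokens per parameter
+compute_optimal = train_flops(params=70e9, tokens=1.4e12)
+
+# A "data optimal" / inference-efficient run: smaller model, many more tokens
+data_optimal = train_flops(params=8e9, tokens=15e12)
+
+print(f"70B on 1.4T tokens: {compute_optimal:.1e} FLOPs")  # ~5.9e+23
+print(f" 8B on  15T tokens: {data_optimal:.1e} FLOPs")     # ~7.2e+23
+# Similar training budgets, but the smaller model is ~9x cheaper to serve per token.
+```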
+
+Microsoft explicitly states this shift in the Phi 3 technical paper,
+saying they "calibrate the training data to be closer to the 'data optimal' regime for small models."[^phi]
+
+{{< callout type="info" emoji="💡" >}}
+  The Chinchilla "laws" describe the relationship between model size (parameters),
+  dataset size (tokens), performance (loss), and cost of compute (FLOPs).
+  Historically, the Chinchilla laws have been used to understand the "compute optimal" point,
+  or the point of diminishing returns (in terms of model performance improvement) associated with additional compute,
+  given a static model size and dataset.
+{{< /callout >}}
+
+Why is this? Well, it turns out that inference is expensive. _Really expensive_.
+Like, OpenAI-might-go-bankrupt-because-it-costs-$700,000-per-day expensive.[^cost]
+LLM providers want efficiency, but they also need to show improvements over the last generation.
+To do this, they have transitioned toward the "data optimal" (or, more importantly, inference-efficient) training paradigm.
+The Chinchilla laws suggest that you can reach the same performance as a "compute optimal" model by training a smaller model for longer on more tokens.[^law]
+The Llama 3 cohort trains models the same size as the prior generation for longer on many more (and more informative) tokens,
+demonstrating generational improvements without increasing model sizes (and therefore without increasing inference requirements).
+
+### Attention on attention
+
+Aside from optimizations to Attention, there have been minimal changes to the Transformer architecture.
+All models use RoPE for their positional embeddings, almost all use RMSNorm,
+and the majority use SwiGLU as the activation function in their feedforward modules.
+
+Almost all of the new models adopt approaches that optimize Attention,
+including Multi-Query Attention (MQA)[^mqa], Grouped-Query Attention (GQA)[^gqa],
+Sliding Window Attention (SWA)[^swa], and even attention sinks[^snk]!
+Most models use GQA, while Mistral retains its use of SWA from its prior generation, and Phi 3 experiments with SWA as well.
+Snowflake shared that they are "developing an attention-sinks-based sliding window implementation
+to support unlimited sequence generation capability" in their Arctic release announcement.[^snow]
+DeepSeek just released their v2, which introduces a novel Multi-head Latent Attention (MLA) mechanism.[^mla]
+
+Why the focus on Attention? The Attention mechanism scales poorly with sequence length.
+As we increase the context length that LLMs can handle, we massively increase the compute and memory required
+to allow each token to attend to every other token (quadratic scaling).
+While Flash Attention[^flash] and KV caching have reduced this worst-case quadratic scaling,
+Attention is still incredibly memory-hungry: KV caching trades linear compute scaling (good!) for increased memory requirements (bad!) at inference time.
+The aforementioned Attention optimizations focus primarily on reducing the memory used by Attention, especially during inference.
+
+### Expert expectations
+
+Four models in the Llama 3 cohort use a Mixture of Experts (MoE) design;
+Snowflake Arctic and DeepSeek-v2 use novel MoE architectures, while Mistral 8x22B and DBRX use a more standard sparse MoE.
+Mixture of Experts models alter the feed-forward component of a standard Transformer, turning the dense feed-forward network
+into a larger, sparse one.
+These wider feed-forward nets are grouped into "experts", and a router determines which experts are active on a per-token basis.
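+
+Concretely, the router means only a few experts run for any given token, even though all of them must be kept
+in memory. Here's a toy parameter count; the dimensions are invented for illustration and don't correspond to
+any particular model:
+
+```python
+# Toy count of feed-forward parameters in a dense vs. MoE transformer block.
+# Dimensions are illustrative assumptions, not taken from any model card.
+
+d_model = 4096     # hidden dimension
+d_ff = 14336       # feed-forward dimension
+n_experts = 8      # experts per MoE layer
+top_k = 2          # experts the router activates per token
+
+# A SwiGLU-style FFN has three weight matrices (up, gate, down)
+dense_ffn = 3 * d_model * d_ff
+
+moe_total = n_experts * dense_ffn   # every expert must sit in memory
+moe_active = top_k * dense_ffn      # only the routed experts run for a given token
+
+print(f"dense FFN params/layer:        {dense_ffn / 1e9:.2f}B")   # 0.18B
+print(f"MoE FFN params/layer (total):  {moe_total / 1e9:.2f}B")   # 1.41B
+print(f"MoE FFN params/layer (active): {moe_active / 1e9:.2f}B")  # 0.35B
+# Memory footprint scales with the total; per-token compute scales with the active subset.
+```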
+
+In theory, MoE allows the model to have a much higher total parameter count while keeping per-token compute
+proportional to the much smaller _active_ parameter count (see `Parameters (active)` in the top table above).
+MoE LLMs are therefore kind of a cheat against the Chinchilla laws;
+they tend to train quickly (i.e., with less compute) and have very good performance (on benchmarks) relative to dense models of equivalent _active_ parameter counts.
+The sparsity of MoE also means they tend to be fast at inference.
+
+The downside?
+While MoE models are _compute_ efficient because only a portion of the sparse feed-forward is active at any one time,
+they have larger memory requirements because all of the parameters (even the inactive ones) have to be loaded into memory
+for inference.
+
+As an aside, I have to wonder about the use of Mixture-of-Experts as a way to increase model size while maintaining or improving inference speed.
+Recent research on pruning dense models indicates LLMs are already overparameterized.[^short] [^prune]
+These MoE models seem to be even further overparameterized than their dense peers.
+Both Databricks DBRX and Snowflake Arctic are MoE models with significantly more total _and_ active parameters than Llama 3 8B,
+and both trained on fewer tokens.
+DBRX is closer (12T for DBRX vs. 15T for Llama 3 8B). Snowflake Arctic trained on fewer than 4T,
+though this may be because Arctic is intended more for "enterprise" use cases than as a general language model
+(the lack of MMLU scores reported for Arctic adds weight to this theory).
+Perhaps the additional overparameterization helps to explain their training speed and performance,
+while the sparsity associated with the expert routing is akin to pruning a dense network?
+
+### Inference implications
+
+{{< callout type="question" emoji="📣" >}}
+  Most of what I'll mention here is a rehash of what I learned listening to [Dylan Patel (of SemiAnalysis) on the Latent Space podcast](https://www.latent.space/p/semianalysis).
+  It is 100% worth the listen (or reviewing the transcript).
+
+  In fact, go do that now. I'll wait.
+{{< /callout >}}
+
+As Dylan Patel pointed out, training LLMs tends to be compute-intensive; compute FLOPs are the bottleneck.
+You get as close to 100% utilization (MFU - model FLOPs utilization) as you can
+(given network/communication and sharding inefficiencies) during training.
+At inference time, you need roughly a third of the per-token compute used in training (you only need the forward pass, no backpropagation).
+If you want to get more responses to more prompts, you need memory.
+If you want to provide longer answers or accept longer input contexts, you need memory.
+And as I discussed earlier, Attention is memory hungry; it scales approximately quadratically with sequence length.
+So all of a sudden, you're limited by memory bandwidth (MBU - _memory_ bandwidth utilization).
+
+Now we see why AI engineers made the choices they did for the Llama 3 cohort.
+MoE lets us take advantage of a larger GPU cluster's total memory and spread the memory bandwidth demand across more individual GPUs.
+Attention optimizations reduce the memory required at a given sequence length, reducing MBU requirements.
+Due to the data-optimal paradigm shift, model sizes have (generally) not increased,
+meaning we get better performance on today's infrastructure, or we can retain current performance with smaller models...
+reducing MBU requirements. Everything comes back to efficiency in inference.
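+
+For a rough sense of scale on that memory pressure, here's a simple KV-cache estimate comparing vanilla
+multi-head attention to GQA. The dimensions are illustrative assumptions (fp16 cache, batch size 1), not a
+claim about any specific model's implementation:
+
+```python
+# Rough KV-cache size: 2 (K and V) * layers * kv_heads * head_dim * seq_len * bytes.
+# Illustrative dimensions; assumes an fp16 cache (2 bytes/value) and batch size 1.
+
+def kv_cache_gib(layers: int, kv_heads: int, head_dim: int, seq_len: int, bytes_per_value: int = 2) -> float:
+    return 2 * layers * kv_heads * head_dim * seq_len * bytes_per_value / 2**30
+
+layers, q_heads, head_dim, seq_len = 32, 32, 128, 8192
+
+mha = kv_cache_gib(layers, kv_heads=q_heads, head_dim=head_dim, seq_len=seq_len)  # every head caches K/V
+gqa = kv_cache_gib(layers, kv_heads=8, head_dim=head_dim, seq_len=seq_len)        # 8 KV groups shared by 32 query heads
+
+print(f"MHA cache @ 8k context: {mha:.1f} GiB")  # ~4.0 GiB
+print(f"GQA cache @ 8k context: {gqa:.1f} GiB")  # ~1.0 GiB
+# The cache grows linearly with context length and batch size --
+# exactly the memory-bandwidth pressure described above.
+```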
+
+### Frustrations and concerns
+
+Over the course of researching this post, I ran into some frustrations:
+
+First, technical papers do not always include details of the model architectures,
+either leaving them entirely unsaid or referring the reader to "prior work".
+This is lazy. Since the reader _does_ have access to the information in the prior work,
+the authors are not hiding anything per se, but they are making the reader chase down the unspecified pieces of information.
+It also reduces replicability (not that replication is possible anyway, given these models are generally open weights
+and do not share training code or source datasets... but I digress)
+and invites assumptions that may not be accurate if the wrong prior work is referenced,
+or if the current implementation makes slight deviations from the prior work
+that were glossed over or forgotten during publication.
+As a side note, I found HuggingFace's model catalog[^huggingface] to be invaluable in filling in many of the details,
+especially around hidden and feed-forward dimensions and attention head and layer counts.
+
+Second, reporting of performance benchmarks is unstandardized.
+Across the papers, there was high variance in benchmark specificity, meaning the reported results are likely _not_ directly comparable --
+and this is before considering any shenanigans in the _implementation_ of the benchmarks.[^openllm] [^benchmarks]
+MMLU might be measured with English-only questions, or on subsets of the benchmark,
+or used to demonstrate multilingual capabilities.
+GSM8K evaluation might use 8-shot chain-of-thought, 5-shot chain-of-thought, 0-shot, or majority voting.
+Further, it's often unclear whether the reported benchmarks are run on the model after pretraining,
+after fine-tuning, or after alignment (or some combination that provides the best results).
+So, take all benchmarks with a grain of salt... which, doesn't that kind of invalidate the raison d'etre of a benchmark?
+
+Finally, I have to credit Meta for revealing training cost in terms of GPU hours _and_ energy implications (tons of CO2 equivalent).
+It is frustrating to me that discussing the energy-grid and environmental implications of training costs remains nonstandard.
+I would like to see all technical papers discuss the training costs/requirements for their models,
+even if they do not release the source code to replicate them.
+GPU class, GPU time, FLOPs, and tCO2eq are all important metrics for understanding the energy requirements of training a model.
+Further, I would also like to see papers report the cost of inference per 1M tokens in the same manner:
+GPUs required, GPU class, FLOPs, and tCO2eq.
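+
+None of this seems hard to estimate from numbers the labs already have. As a minimal sketch of the kind of
+accounting I mean (the carbon-intensity factor below is my assumption, roughly back-solved from Meta's
+reported figures, not a universal constant):
+
+```python
+# Rough training-footprint accounting from disclosed GPU hours:
+# energy ≈ GPU-hours * per-GPU power; emissions ≈ energy * grid carbon intensity.
+# The 0.423 kgCO2eq/kWh intensity is an assumption, not a reported value.
+
+def training_footprint(gpu_hours: float, gpu_watts: float, kg_co2_per_kwh: float = 0.423):
+    energy_kwh = gpu_hours * gpu_watts / 1000
+    tco2eq = energy_kwh * kg_co2_per_kwh / 1000
+    return energy_kwh, tco2eq
+
+# Llama 2 70B: 1,720,320 A100 hours at 400 W (from the table above)
+energy, emissions = training_footprint(1_720_320, 400)
+print(f"{energy:,.0f} kWh, ~{emissions:.0f} tCO2eq")  # 688,128 kWh, ~291 tCO2eq
+```
+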
+ +## References + +- [Models - Hugging Face](https://huggingface.co/models) +- [[2302.13971] LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) +- [[2307.09288] Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288) +- [Introducing Meta Llama 3: The most capable openly available LLM to date](https://ai.meta.com/blog/meta-llama-3/) +- [gemma-report.pdf](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf) +- [[2310.06825] Mistral 7B](https://arxiv.org/abs/2310.06825) +- [[2401.04088] Mixtral of Experts](https://arxiv.org/abs/2401.04088) +- [Cheaper, Better, Faster, Stronger | Mistral AI | Frontier AI in your hands](https://mistral.ai/news/mixtral-8x22b/) +- [mistralai/mistral-src: Reference implementation of Mistral AI 7B v0.1 model.](https://github.com/mistralai/mistral-src) +- [databricks/dbrx: Code examples and resources for DBRX, a large language model developed by Databricks](https://github.com/databricks/dbrx) +- [Snowflake-Labs/snowflake-arctic](https://github.com/Snowflake-Labs/snowflake-arctic) +- [Snowflake Arctic Cookbook Series: Arctic's Approach to Data | by Snowflake AI Research | Snowflake Builders Blog: Data Engineers, App Developers, AI/ML, & Data Science | Apr, 2024 | Medium](https://medium.com/snowflake/snowflake-arctic-cookbook-series-arctics-approach-to-data-b81a8a0958bd) +- [[2309.05463] Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) +- [[2306.11644] Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) +- [Command R: RAG at Production Scale](https://cohere.com/blog/command-r) +- [Introducing the next generation of Claude \ Anthropic](https://www.anthropic.com/news/claude-3-family) +- [Model_Card_Claude_3.pdf](https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf) +- [Zhen Wang on LinkedIn: #anthropic #claude #tokenizer #llm](https://www.linkedin.com/posts/zhenwang_anthropic-claude-tokenizer-activity-7067072872019619840-hZ-7) +- [GPT-4 | OpenAI](https://openai.com/index/gpt-4-research/) +- [[2303.08774] GPT-4 Technical Report](https://arxiv.org/abs/2303.08774) +- [Andrej Karpathy on X: "Congrats to @AIatMeta on Llama 3 release!! 
🎉](https://twitter.com/karpathy/status/1781028605709234613) + +## Works Cited + +[^compute]: [Estimating Training Compute of Deep Learning Models – Epoch AI](https://epochai.org/blog/estimating-training-compute) +[^dbrx]: [Introducing DBRX: A New State-of-the-Art Open LLM | Databricks Blog](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) +[^phi]: [[2404.14219] Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) +[^cost]: [The Inference Cost Of Search Disruption – Large Language Model Cost Analysis](https://www.semianalysis.com/p/the-inference-cost-of-search-disruption) +[^law]: [Revised Chinchilla scaling laws – LLM compute and token requirements – Educating Silicon](https://www.educatingsilicon.com/2024/04/29/revised-chinchilla-scaling-laws-impact-on-llm-compute-and-token-requirements/) +[^mqa]: [[1911.02150] Fast Transformer Decoding: One Write-Head is All You Need](https://arxiv.org/abs/1911.02150) +[^gqa]: [[2305.13245] GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints](https://arxiv.org/abs/2305.13245) +[^swa]: [[2004.05150v2] Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150v2) +[^snk]: [[2309.17453] Efficient Streaming Language Models with Attention Sinks](https://arxiv.org/abs/2309.17453) +[^snow]: [Snowflake Arctic - LLM for Enterprise AI](https://www.snowflake.com/blog/arctic-open-efficient-foundation-language-models-snowflake/) +[^mla]: [[2405.04434v2] DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model](https://arxiv.org/abs/2405.04434v2) +[^flash]: [[2205.14135] FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness](https://arxiv.org/abs/2205.14135) +[^short]: [[2403.03853] ShortGPT: Layers in Large Language Models are More Redundant Than You Expect](https://arxiv.org/abs/2403.03853) +[^prune]: [[2306.11695] A Simple and Effective Pruning Approach for Large Language Models](https://arxiv.org/abs/2306.11695) +[^huggingface]: [Models - Hugging Face](https://huggingface.co/models) +[^openllm]: [What's going on with the Open LLM Leaderboard?](https://huggingface.co/blog/open-llm-leaderboard-mmlu) +[^benchmarks]: [[2402.01781v1] When Benchmarks are Targets: Revealing the Sensitivity of Large Language Model Leaderboards](https://arxiv.org/abs/2402.01781v1?trk=public_post_comment-text) diff --git a/content/blog/no-mo-robo/index.md b/content/blog/no-mo-robo/index.md index fd9e51f..c34a1a0 100644 --- a/content/blog/no-mo-robo/index.md +++ b/content/blog/no-mo-robo/index.md @@ -1,12 +1,19 @@ --- title: No Mo Robo date: 2024-05-03 +authors: + - name: ahgraber + link: https://github.com/ahgraber + image: https://github.com/ahgraber.png tags: - 'opinion' - 'generative AI' - 'LLMs' - 'copyright' series: [] +layout: single +toc: true +math: false draft: false --- ## Web crawlers & search diff --git a/content/blog/the-compounding-error-of-generative-models/index.md b/content/blog/the-compounding-error-of-generative-models/index.md index 7043972..8633892 100644 --- a/content/blog/the-compounding-error-of-generative-models/index.md +++ b/content/blog/the-compounding-error-of-generative-models/index.md @@ -1,12 +1,18 @@ --- title: The Compounding Error of Generative Models date: 2024-04-30 +authors: + - name: ahgraber + link: https://github.com/ahgraber + image: https://github.com/ahgraber.png tags: - 'agents' - 'generative AI' - 'prompts' - 'LLMs' series: [] +layout: single +toc: true math: true 
draft: false --- diff --git a/layouts/blog/wide.html b/layouts/blog/wide.html new file mode 100644 index 0000000..0ba01e1 --- /dev/null +++ b/layouts/blog/wide.html @@ -0,0 +1,54 @@ +{{ define "main" }} +
{{ . | markdownify | emojify }} | {{ end }} +|
---|---|
{{ . }} | + {{ else }} +{{ . | markdownify | emojify }} | + {{ end }} + {{ end }} +