diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 0000000000..6746ff98dc --- /dev/null +++ b/.github/workflows/python.yml @@ -0,0 +1,35 @@ +name: python + +on: + workflow_dispatch: + pull_request: + branches: + '**' + schedule: + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + unit-tests: + strategy: + matrix: + pyVersion: ["3.10"] + fail-fast: false + + runs-on: ubuntu-22.04 + container: + image: deepspeed/gh-builder:py${{ matrix.pyVersion }} + + steps: + - uses: actions/checkout@v4 + + - name: environment + run: | + which python + python --version + - name: Install Megatron-DeepSpeed + run: | + pip3 install . diff --git a/.gitignore b/.gitignore index 3e46cef4c5..5079fb8bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,45 @@ +# User Added +.jobenv +**.e[0-9]** +**.o[0-9]** +**.e6** +**.o6** +**.e9** +**.o9** +**.e1** +**.o1** +*.o17* +*.e17* +*.o1 +*.e1 +deps/* +OUTPUTS/* +ALCF/OUTPUTS/* +*tmp* +*core.* +*old* +*.bak +**index-cache** +**pbslogs** +ezpz +*hostfile* +.deepspeed_env +*.DS_Store +old/* +**venv** +*.json +outputs/ +venvs/ +wandb/ +llama-logs/ +checkpoints/ +*.gz +*.txt +*.idx +*.bin +*.log +__pycache__ + .deepspeed_env *.bak .cache/* diff --git a/ALCF/README.md b/ALCF/README.md index 1a8612ed8a..b0eb99deb6 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -1,74 +1,947 @@ # Megatron-DeepSpeed @ ALCF -## Polaris +> [!IMPORTANT] +> [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh) is the main entry point for launching +> distributed training on {Polaris, Aurora, Sunspot} @ ALCF. -- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` - - [ ] specifically, `momentum, beta{1, 2}, etc` - -
Completed +## 🏃‍♂️ Running -- Continue runs on Polaris @ - - [x] 48 Nodes - - [x] 32 Nodes - - [x] 16 Nodes - - [x] 8 Nodes - - [x] 4 Nodes +To launch on {`Polaris`, `Aurora`, `Sunspot`} @ [ALCF](https://alcf.anl.gov): -- [x] Then, try re-creating ( / fixing) conda with `cuda==12.1` - - 😔, failed. - -- ~~‼️ Unable to save checkpoints with `torch==2.1` + `cuda==11.8`~~: - - Fixed in [a57a21f](https://github.com/argonne-lcf/Megatron-DeepSpeed/commit/a57a21f6b2a8abf847f5ef599e1b1edcb5a5e1b5) +1.
⏳ Request an interactive job with qsub -I: -
🐛 Bug - - - Training progresses OK: + ```bash + qsub -A <your-project> -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I + ``` + + - Or, alternatively, you can submit [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh) + directly as a batch script with ```bash - [2024-03-07 15:27:02,646] [INFO] [timer.py:260:stop] epoch=0/micro_step=199/global_step=199, RunningAvgSamplesPerSec=58.730622229657506, CurrSamplesPerSec=61.35304005128382, MemAllocated=6.01GB, MaxMemAllocated=19.52GB - iteration 199/ 317892 | consumed samples: 152832 | consumed tokens: 625999872 | elapsed time per iteration (ms): 14287.5 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.905366E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 53.753 | tokens per gpu per second (tgs): 1146.733 | TFLOPs: 69.85 | - [2024-03-07 15:27:15,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=4, lr=[0.000240653265864008, 0.000240653265864008], mom=[(0.9, 0.999), (0.9, 0.999)] - [2024-03-07 15:27:17,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=58.730745476291396, CurrSamplesPerSec=58.75503515561452, MemAllocated=6.01GB, MaxMemAllocated=19.52GB - iteration 200/ 317892 | consumed samples: 153600 | consumed tokens: 629145600 | elapsed time per iteration (ms): 14541.4 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.897035E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 52.815 | tokens per gpu per second (tgs): 1126.713 | TFLOPs: 68.63 | - saving checkpoint at iteration 200 to checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb768_pp1_tp2_fp16 - # ... + cd Megatron-DeepSpeed + qsub -A <your-project> -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home train_llama_alcf.sh ``` - Then crashes with: +
- ```python - Traceback (most recent call last): - Traceback (most recent call last): - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 575, in - model = main() - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 554, in main - model = pretrain( - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 226, in pretrain - iteration = train(forward_step_func, - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1290, in train - save_checkpoint_and_time(iteration, model, optimizer, - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1151, in save_checkpoint_and_time - save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 259, in save_checkpoint - state_dict[UNIVERSAL_CHECKPOINT_INFO] = _universal_checkpoint_info(model) - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 783, in _universal_checkpoint_info - info.update(model[0].universal_checkpoint_info()) - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 203, in universal_checkpoint_info - info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() - File "/lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__ - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") - AttributeError: 'GPTModel' object has no attribute '_get_tp_replicated_param_patterns' - ``` +2.
⬇️ Clone repo + navigate into it: + + ```bash + git clone "https://github.com/argonne-lcf/Megatron-DeepSpeed" + cd Megatron-DeepSpeed + ``` - 🤔
+3.
🐍 Setup Python: + +
+ + > **NOTE**: The following commands should be run from [`Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed), following the `cd` command from step 2. + + 1. Load `conda` module and activate base environment: + + ```bash + export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && ezpz_setup + ``` + + -
[output]: + +
+ + -
[Polaris]: + + ```bash + # [05:47:13 PM][foremans@x3001c0s13b1n0][/eagle/a/f/p/ar/Megatron-DeepSpeed-D/Megatron-DeepSpeed] + $ PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh && setup_python + Using WORKING_DIR: /eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed-DistributedDataLoading/Megatron-DeepSpeed + No conda_prefix or virtual_env found in environment... + Setting up conda... + Running on Polaris !! + + Lmod is automatically replacing "nvhpc/23.9" with "gcc-native/12.3". + + + Lmod is automatically replacing "PrgEnv-nvhpc/8.5.0" with "PrgEnv-gnu/8.5.0". + + + Due to MODULEPATH changes, the following have been reloaded: + 1) cray-mpich/8.1.28 + + Found conda at: /soft/applications/conda/2024-04-29/mconda3 + No VIRTUAL_ENV found in environment! + - Trying to setup from /soft/applications/conda/2024-04-29/mconda3 + - Using VENV_DIR=/eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed-DistributedDataLoading/Megatron-DeepSpeed/venvs/2024-04-29 + - Found existing venv, activating from /eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed-DistributedDataLoading/Megatron-DeepSpeed/venvs/2024-04-29 + [python] Using: /eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed-DistributedDataLoading/Megatron-DeepSpeed/venvs/2024-04-29/bin/python3 + ``` + +
+ + -
[Aurora]: + + ```bash + # [10:04:02 PM][foremans@x4415c0s2b0n0][/gecko/A/fo/p/a/Megatron-DeepSpeed] + $ PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh && setup_python + Using WORKING_DIR: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed + No conda_prefix or virtual_env found in environment... + Setting up conda... + + The following have been reloaded with a version change: + 1) intel_compute_runtime/release/821.36 => intel_compute_runtime/release/803.29 2) oneapi/eng-compiler/2024.04.15.002 => oneapi/release/2024.1 + + Found conda at: /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1 + No VIRTUAL_ENV found in environment! + - Trying to setup from /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1 + - Using VENV_DIR=/gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1 + - Found existing venv, activating from /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1 + [python] Using: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 + ``` + +
+ + -
[Sunspot]: + + ```bash + # [05:37:18 PM][foremans@x1921c0s0b0n0][/gila/A/fo/p/a/Megatron-DeepSpeed] + $ PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh && setup_python + Using WORKING_DIR: /gila/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed + No conda_prefix or virtual_env found in environment... + Setting up conda... + Running on SunSpot !! + + Due to MODULEPATH changes, the following have been reloaded: + 1) gcc/12.2.0 5) mpich-config/collective-tuning/1024 + 2) gmp/6.2.1-pcxzkau 6) mpich/icc-all-pmix-gpu/20231026 + 3) mpc/1.3.1-dfagrna 7) oneapi/eng-compiler/2024.04.15.002 + 4) mpfr/4.2.0-w7v7yjv + + The following have been reloaded with a version change: + 1) intel_compute_runtime/release/821.36 => intel_compute_runtime/release/775.20 + 2) spack-pe-gcc/0.7.0-24.086.0 => spack-pe-gcc/0.6.1-23.275.2 + UMD: agama-ci-devel-803.29 successfully loaded: + UMD: graphics-compute-runtime/agama-ci-devel-803.29 + + The following have been reloaded with a version change: + 1) oneapi/eng-compiler/2024.04.15.002 => oneapi/release/2024.04.15.001 + + Found conda at: /soft/datascience/aurora_nre_models_frameworks-2024.1_preview_u1 + No VIRTUAL_ENV found in environment! + - Trying to setup from /soft/datascience/aurora_nre_models_frameworks-2024.1_preview_u1 + - Using VENV_DIR=/gila/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1_preview_u1 + - Found existing venv, activating from /gila/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1_preview_u1 + [python] Using: /lus/gila/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1_preview_u1/bin/python3 + ``` + +
+ + + + 2. 🍋 Install [`ezpz`](https://github.com/saforem2/ezpz): + + ```bash + mkdir deps && git clone https://github.com/saforem2/ezpz deps/ezpz + python3 -m pip install -e deps/ezpz --require-virtualenv + ``` + + [^venv]: It's generally good practice to keep separate virtual Python environments for different projects. + We provide a helper function, [`setup_venv_from_conda()`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/2f0154394bbdf3c64b4669f9d944645e2cdb8f2b/ALCF/helpers.sh#L440), + that helps take care of this for you. +
+ This will: activate (or build, if necessary) a `venv` in your working dir, + _automatically_ matching the name of your active `conda` environment (e.g. `2024-04-29`, on Polaris). + + 3. Setup [`wandb`](https://docs.wandb.ai/quickstart) + + > **NOTE**: this can be disabled by setting `export WANDB_DISABLED=1` +
+ + +4.
🚀 Launch: + + In this case, train a ~ 2B Model (with 10 layers), + for 1000 iterations using the data file list in: + + [`ALCF/data-lists/polaris/books.txt`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/data-lists/polaris/books.txt) + + with a micro-batch-size of 2 (`MICRO_BATCH=2`), with the `torch.optim.AdamW` optimizer (`OPT=adamw`). + + **Note** that _any_ of the options in the [`setParams`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/helpers.sh#L140) + function from [`ALCF/helpers.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/7d203596dbf14e048e756c5ee6705de7dcb22283/ALCF/helpers.sh) + can be overridden dynamically at runtime by setting them as environment variables, as in the command below. + + ```bash + # for systems other than Polaris, replace "polaris/books.txt" below with: + # "{aurora,sunspot}/books.txt", + PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt TRAIN_ITER=1000 NLAYERS=10 MICRO_BATCH=2 OPT=adamw bash train_llama_alcf.sh + ``` + + - **Note**: If no additional options are specified, i.e. + + ```bash + PBS_O_WORKDIR=$(pwd) bash train_llama_alcf.sh + ``` + + then this will fall back to using the default AuroraGPT-7B architecture + with the full Dolma (v1.7) dataset. + +
[output]: + +
+ + The outputs should look _something_ like this, though YMMV (things change quickly): + +
[Aurora]: + + ```bash + #[🌌][10:45:59 AM][foremans@x4711c1s2b0n0][…/Megatron-DeepSpeed][🌱 main][$!?] + $ export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && setup_python + + #[🌌][10:46:57 AM][foremans@x4711c1s2b0n0][…/Megatron-DeepSpeed][🌱 main][$!?][aurora_nre_models_frameworks-2024.1] + (aurora_nre_models_frameworks-2024.1) $ PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/aurora/books.txt bash train_llama_alcf.sh > train-log-$(tstamp).log 2>&1 & + + Using WORKING_DIR: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed + Running on: aurora + Using virtual_env: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1 on top of conda from: /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1 + [python] Using: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 + Ensuring all dependencies from /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/requirements/requirements.txt installed... 
+ + [notice] A new release of pip is available: 24.0 -> 24.1 + [notice] To update, run: pip install --upgrade pip + ┌─────────────────────────────────────────────────────────────────────┐ + │ [savejobenv]: + │ • Writing PBS vars to: /home/foremans/.pbsenv + └─────────────────────────────────────────────────────────────────────┘ + ┌─────────────────────────────────────────────────────────────────────┐ + │ [HOSTS]: + │ • [host:0] - x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov + │ • [host:1] - x4711c1s3b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov + └─────────────────────────────────────────────────────────────────────┘ + ┌─────────────────────────────────────────────────────────────────────┐ + │ [DIST INFO]: + │ • HOSTFILE=/var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov + │ • NHOSTS=2 + │ • NGPU_PER_HOST=12 + │ • NGPUS=24 + └─────────────────────────────────────────────────────────────────────┘ + ┌─────────────────────────────────────────────────────────────────────┐ + │ [LAUNCH]: + │ • To launch across all available GPUs, use: + │ 'launch' ( = mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov ) + └─────────────────────────────────────────────────────────────────────┘ + 2024-06-21 10:47:09,771 - numexpr.utils - INFO - Note: detected 208 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable. + 2024-06-21 10:47:09,772 - numexpr.utils - INFO - Note: NumExpr detected 208 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. + 2024-06-21 10:47:09,772 - numexpr.utils - INFO - NumExpr defaulting to 8 threads. 
+ /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/lib/python3.9/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.4' or n> + from pandas.core.computation.check import NUMEXPR_INSTALLED + /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1/lib/python3.9/runpy.py:127: RuntimeWarning: 'ezpz.jobs' found in sys.modules after import of package 'ezpz', but prior to execution of 'ezpz.jobs'; this may result in u> + warn(RuntimeWarning(msg)) + [2024-06-21 10:47:10][INFO][jobs:366] - Caught PBS_JOBID='684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov', pbsnf=PosixPath('/var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov') from env. Saving jobenv! + [2024-06-21 10:47:10][WARNING][jobs:117] - /home/foremans/PBS-jobs/684084 already in /home/foremans/PBS-jobs.log, not appending !! + [2024-06-21 10:47:10][INFO][jobs:192] - Saving job env to /home/foremans/PBS-jobs/684084/jobenv.sh + [2024-06-21 10:47:10][INFO][jobs:220] - Saving job env to /home/foremans/PBS-jobs/684084/jobenv.json + [2024-06-21 10:47:10][INFO][jobs:233] - Saving job env to /home/foremans/PBS-jobs/684084/jobenv.yaml + [2024-06-21 10:47:10][INFO][jobs:137] - Saving job env to .jobenv file in /home/foremans/PBS-jobs/684084/.jobenv + [2024-06-21 10:47:10][INFO][jobs:137] - Saving job env to .jobenv file in /lus/gecko/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/.jobenv + [2024-06-21 10:47:10][WARNING][jobs:154] - To use launch alias, be sure to: source /lus/gecko/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/.jobenv + [2024-06-21 10:47:10][INFO][jobs:277] - Writing PBS env vars to /home/foremans/PBS-jobs/684084 / jobenv{.sh, .yaml, .json} + [2024-06-21 10:47:10][WARNING][jobs:281] - Run: source ./.jobenv in your current shell to set job variables + [2024-06-21 10:47:10][INFO][jobs:374] - + 
[DIST_INFO]: + • DEVICE=xpu + • DEVICE_ID=xpu:0 + • DISTRIBUTED_BACKEND=ccl + • GPUS_PER_NODE=12 + • HOSTS=['x4711c1s2b0n0', 'x4711c1s3b0n0'] + • HOSTFILE=/var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov + • HOSTNAME=x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov + • LOCAL_RANK=0 + • MACHINE=Aurora + • NUM_NODES=2 + • NGPUS=24 + • NODE_ID=0 + • RANK=0 + • SCHEDULER=PBS + • WORLD_SIZE_TOTAL=24 + • WORLD_SIZE_IN_USE=1 + [2024-06-21 10:47:10][CRITICAL][jobs:245] - To launch across ALL GPUs in your job, use: + LAUNCH_CMD=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov + creating alias launch=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov + Found ezpz! + + [notice] A new release of pip is available: 24.0 -> 24.1 + [notice] To update, run: pip install --upgrade pip + Done with ezpz. + Not using flash-attn!! + LR_ARGS: --lr 0.0003 --lr-decay-style cosine --lr-warmup-fraction 0.05 + DS_CONFIG: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/ds-configs/ds_stage1_mb4_gb768_pp1_bf16.json + ZS: 1, MB: 4, GB: 768, PP: 1, DTYPE: bf16 + Please see logs at: logs/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240621-104713_24_x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov + Checkpoints will be saved to: checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05 + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + Setting up tokenizer with Llama2 + Using data_file_list: ./ALCF/data-lists/aurora/books.txt + Using tokenizer: Llama2. 
Setting up data with ./ALCF/data-lists/aurora/books.txt + Calling: setData() with ./ALCF/data-lists/aurora/books.txt + -------------------- + Updated environment: + DATA_FILE_LIST: ./ALCF/data-lists/aurora/books.txt + NUM_DOCS: 3 + WEIGHT_SUM: 0.0072042092147565125 + DFL_STEM: books + DATA_CACHE_PATH: .cache/books/index-cache + DATA_FLAGS: --data-file-list ./ALCF/data-lists/aurora/books.txt + -------------------- + [setData] DATA_FLAGS: --data-file-list ./ALCF/data-lists/aurora/books.txt + [setData] TOKENIZER_FLAGS: --tokenizer-type Llama2Tokenizer --tokenizer-model /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/tokenizer.model + Requirement already satisfied: pybind11 in ./venvs/aurora_nre_models_frameworks-2024.1/lib/python3.9/site-packages (2.12.0) + + [notice] A new release of pip is available: 24.0 -> 24.1 + [notice] To update, run: pip install --upgrade pip + make: Nothing to be done for 'default'. + /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed + ++++++++++++++++++++++++++++++++++++++++++++++++++ + - MPICH_DIR=/opt/aurora/24.086.0/CNDA/mpich/20231026/mpich-ofi-all-icc-default-pmix-gpu-drop20231026 + - Using /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 + - WORLD_SIZE:24 + - BACKEND: ccl + - MODEL_TYPE: llama-seq4096-pp1-tp1-32layers-32heads-4096hidden + - Using DATA_FILE_LIST: ./ALCF/data-lists/aurora/books.txt + ++++++++++++++++++++++++++++++++++++++++++++++++++ + + Currently Loaded Modules: + 1) mpich/icc-all-pmix-gpu/20231026 3) libfabric/1.15.2.0 5) cray-libpals/1.3.3 7) gmp/6.2.1-pcxzkau 9) mpc/1.3.1-dfagrna 11) intel_compute_runtime/release/803.29 13) frameworks/2024.1 + 2) mpich-config/collective-tuning/1024 4) cray-pals/1.3.3 6) spack-pe-gcc/0.7.0-24.086.0 8) mpfr/4.2.0-w7v7yjv 10) gcc/12.2.0 12) oneapi/release/2024.1 + + + + Saving environment to 
checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.env + Not currently running. Continuing! + Launching with: MPICH + mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --genvall --cpu-bind depth -d 16 /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 -Wignore /lus/gecko/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/pretrain_gpt_alcf.py + Using data_cache_path: checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache + + mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --genvall --cpu-bind depth -d 16 /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 -Wignore /lus/gecko/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion --no-bias-dropout-fusion --no-masked-softmax-fusion --no-gradient-accumulation-fusion > + + [!! NOTE] View output at: + logs/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240621-104713_24_x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov/output.log + Connected to tcp://x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov:7919 + Launching application eafe3e80-ad2e-4cee-a3e4-d63af2a77c66 + [2024-06-21 10:47:31,610] [INFO] [comm.py:161:init_deepspeed_backend] Initialize ccl backend + [2024-06-21 10:47:31,610] [INFO] [comm.py:637:init_distributed] cdb=None + [2024-06-21 10:47:31,610] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... 
+ [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=15, local_rank=3, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=23, local_rank=11, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=12, local_rank=0, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=13, local_rank=1, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=3, local_rank=3, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=14, local_rank=2, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=16, local_rank=4, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=17, local_rank=5, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=18, local_rank=6, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=19, local_rank=7, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=20, local_rank=8, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] 
[comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=21, local_rank=9, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=22, local_rank=10, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=6, local_rank=6, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=8, local_rank=8, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=10, local_rank=10, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend ccl + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=1, local_rank=1, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=2, local_rank=2, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=4, local_rank=4, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=5, local_rank=5, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=7, local_rank=7, world_size=24, 
master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=9, local_rank=9, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=11, local_rank=11, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=2/23][local_rank=2/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=6/23][local_rank=6/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=8/23][local_rank=8/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=1/23][local_rank=1/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=15/23][local_rank=3/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=17/23][local_rank=5/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=3/23][local_rank=3/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=13/23][local_rank=1/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=14/23][local_rank=2/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=4/23][local_rank=4/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=18/23][local_rank=6/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=20/23][local_rank=8/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=5/23][local_rank=5/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=21/23][local_rank=9/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=7/23][local_rank=7/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=23/23][local_rank=11/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - 
[device='xpu'][rank=9/23][local_rank=9/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=12/23][local_rank=0/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=16/23][local_rank=4/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=10/23][local_rank=10/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=11/23][local_rank=11/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=19/23][local_rank=7/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=22/23][local_rank=10/11][node=0/1] + 2024-06-21 10:47:32][INFO][dist:240] - DistInfo={ + "DEVICE": "xpu", + "DEVICE_ID": "xpu:0", + "DISTRIBUTED_BACKEND": "ccl", + "GPUS_PER_NODE": 12, + "HOSTFILE": "/var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov", + "HOSTNAME": "x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov", + "HOSTS": "['x4711c1s2b0n0', 'x4711c1s3b0n0']", + "LOCAL_RANK": 0, + "MACHINE": "Aurora", + "NGPUS": 24, + "NODE_ID": 0, + "NUM_NODES": 2, + "RANK": 0, + "SCHEDULER": "PBS", + "WORLD_SIZE_IN_USE": 24, + "WORLD_SIZE_TOTAL": 24 + } + + # [...clipped...] 
+ + [2024-06-21 10:48:48][INFO][utils:307] - > elapsed time for building blendable dataset indices: 1.19 (sec) + [2024-06-21 10:48:48][INFO][utils:307] - > saving index map files + [2024-06-21 10:48:51][INFO][utils:307] - > finished saving index map files in 3.0829622745513916 seconds + [2024-06-21 10:48:51][INFO][utils:307] - > loading blendable dataset index: checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/49e9529a32d0a98f1e40f4a82872b11c_index.npy + [2024-06-21 10:48:52][INFO][utils:307] - > loading blendable dataset sample index: checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/49e9529a32d0a98f1e40f4a82872b11c_sample_index.npy + [2024-06-21 10:48:52][INFO][utils:307] - > finished loading in 0.30188989639282227 seconds + [2024-06-21 10:48:52][INFO][utils:307] - >> building dataset for /gecko/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document + [2024-06-21 10:48:52][INFO][utils:307] - > building dataset index ... + [2024-06-21 10:48:52][INFO][utils:307] - reading sizes... + [2024-06-21 10:48:52][INFO][utils:307] - reading pointers... + [2024-06-21 10:48:52][INFO][utils:307] - reading document index... + [2024-06-21 10:48:52][INFO][utils:307] - creating numpy buffer of mmap... + [2024-06-21 10:48:52][INFO][utils:307] - /gecko/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document.bin + [2024-06-21 10:48:52][INFO][utils:307] - creating memory view of numpy buffer... 
+ [2024-06-21 10:48:52][INFO][utils:307] - > finished creating indexed dataset in 0.003112 seconds + [2024-06-21 10:48:52][INFO][utils:307] - number of documents: 7386 + [2024-06-21 10:48:52][INFO][utils:307] - > dataset split: + [2024-06-21 10:48:52][INFO][utils:307] - train: + [2024-06-21 10:48:52][INFO][utils:307] - document indices in [0, 7386) total of 7386 documents + [2024-06-21 10:48:52][INFO][utils:307] - validation: + [2024-06-21 10:48:52][INFO][utils:307] - document indices in [7386, 7386) total of 0 documents + [2024-06-21 10:48:52][INFO][utils:307] - test: + [2024-06-21 10:48:52][INFO][utils:307] - document indices in [7386, 7386) total of 0 documents + [2024-06-21 10:48:52][INFO][utils:307] - > loading doc-idx mapping from checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/1fa7757ef8907da21e1e1326705e7f3f_doc_idx.npy + [2024-06-21 10:48:52][INFO][utils:307] - > loading sample-idx mapping from checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/1fa7757ef8907da21e1e1326705e7f3f_sample_idx.npy + [2024-06-21 10:48:52][INFO][utils:307] - > loading shuffle-idx mapping from checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/1fa7757ef8907da21e1e1326705e7f3f_shuffle_idx.npy + [2024-06-21 10:48:52][INFO][utils:307] - loaded indexed file in 0.008 seconds + [2024-06-21 10:48:52][INFO][utils:307] - total number of samples: 34196233 + [2024-06-21 10:48:52][INFO][utils:307] - total number of epochs: 175 + [2024-06-21 10:48:52][INFO][utils:307] - > size of blendable dataset: 245361763 samples + [2024-06-21 10:48:52][INFO][utils:307] - >>> Finished building BlendableDataset in 4.613574266433716 seconds + [2024-06-21 10:48:52][INFO][pretrain_gpt_alcf:579] - > finished creating GPT datasets. 
Took: 45730179865763.24219s + [2024-06-21 10:48:53][INFO][training:88] - [after dataloaders are built] datetime=2024-06-21 10:48:53 + [2024-06-21 10:48:53][INFO][training:307] - done with setup ... + [2024-06-21 10:48:53][INFO][training:313] - training ... + (min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (63763.34, 63857.25) + train/valid/test-data-iterators-setup ..........: (12936.53, 13432.64) + [2024-06-21 10:48:53][INFO][training:88] - [before the start of training step] datetime=2024-06-21 10:48:53 + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:541:forward] Activation Checkpointing Information + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:542:forward] ----Partition Activations False, CPU CHECKPOINTING False + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:543:forward] ----contiguous Memory Checkpointing False with 32 total layers + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:545:forward] ----Synchronization False + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:546:forward] ----Profiling time in checkpointing False + [2024-06-21 10:50:42,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1867.64 | optimizer_gradients: 19.65 | optimizer_step: 46.07 + [2024-06-21 10:50:42,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[1.887433467970254e-08, 1.887433467970254e-08], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-06-21 10:50:42,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 25341.72 | bwd_microstep: 77707.38 | bwd_inner_microstep: 75751.84 | bwd_allreduce_microstep: 1955.54 | step_microstep: 2218.38 + [2024-06-21 10:50:42,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 25341.72 | bwd: 77707.38 | bwd_inner: 75751.84 | bwd_allreduce: 1955.54 | step: 2218.38 + [2024-06-21 10:50:42][INFO][training:1609] - iteration= 1/ 317892 | consumed_samples= 768 | consumed_tokens= 3145728 | elapsed_time_per_iteration_ms=108893.2 | 
learning_rate=1.88743e-08 | global_batch_size= 768 | lm loss=11.133188 | loss_scale=1.0 | actual_seqlen= 4096 | number_of_skipped_iterations= 0 | number_of_nan_iterations= 0 | samples_per_second=7.053 | tokens_per_gpu_per_second_tgs=1203.674 | [LM]-TFLOPs=49.66 | [DS]-TFLOPs=73.32 | + [2024-06-21 10:50:42][INFO][utils:190] - [Rank 0] (after 1 iterations) memory (MB) | allocated: 18243.64111328125 | max allocated: 50664.2548828125 | reserved: 54556.0 | max reserved: 54556.0 + (min, max) time across ranks (ms): + forward-backward ...............................: (106622.81, 106624.28) + optimizer ......................................: (2221.02, 2234.98) + ``` + +
+ +
[Sunspot]: + + ```bash + # [09:07:32 AM][foremans@x1921c0s0b0n0][~/q/llm.devkit/Megatron-DeepSpeed][🌱 main][$!?] + $ PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt bash train_llama_alcf.sh + source-ing /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ALCF/helpers.sh + Sourcing /home/foremans/q4-drop_sunspot/llm.devkit/setenv.sh... + UMD: agama-ci-devel-736.9 successfully loaded: + UMD: graphics-compute-runtime/agama-ci-devel-736.9 + Lmod has detected the following error: The following module(s) are unknown: "gcc/12.1.0" + + Please check the spelling or version number. Also try "module spider ..." + It is also possible your cache file is out-of-date; it may help to try: + $ module --ignore_cache load "gcc/12.1.0" + + Also make sure that all modulefiles written in TCL start with the string #%Module + + Note: the module "intel_compute_runtime/release/agama-devel-647" cannot be unloaded because it was not loaded. + + Running on SunSpot !! + [python] Using: /home/foremans/miniconda3/envs/q4-drop/bin/python3 + Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env + Found ezpz! + /lus/gila/projects/Aurora_deployment/foremans/locations/sunspot/projects/saforem2/ezpz/src/ezpz/__init__.py + Has ezpz installed. Nothing to do. + Done with ezpz. 
+ ┌─────────────────────────────────────────────────────────────────── + │ Writing PBS vars to /home/foremans/.pbsenv + │ HOSTFILE: /var/spool/pbs/aux/8988430.amn-0001 + │ NHOSTS: 2 + │ NGPU_PER_HOST: 12 GPUs per host + │ NGPUS: 24 GPUs total + └─────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Hosts]: + │ • [host:0] - x1921c0s0b0n0.hostmgmt2000.cm.americas.sgi.com + │ • [host:1] - x1921c0s1b0n0.hostmgmt2000.cm.americas.sgi.com + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [DIST INFO]: + │ • Loading job env from: /home/foremans/.pbsenv + │ • HOSTFILE: /var/spool/pbs/aux/8988430.amn-0001 + │ • NHOSTS: 2 + │ • NGPU_PER_HOST: 12 + │ • NGPUS (NHOSTS x NGPU_PER_HOST): 24 + │ • WORLD_SIZE: 24 + │ • DIST_LAUNCH: mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/8988430.amn-0001 + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Launch]: + │ • Use: 'launch' (=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/8988430.amn-0001) + │ to launch job + └────────────────────────────────────────────────────────────────── + DS_CONFIG: ds_stage2_mb4_gb96_pp1_bf16.json + ZS: 2, CPU_OPTIMIZER: , MB: 4, GB: 96, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0 + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! 
+ Calling: setData() with ./convergence_debug_small.txt + -------------------- + Updated environment: + DATA_FILE_LIST: ./convergence_debug_small.txt + NUM_DOCS: 15 + WEIGHT_SUM: 15.0 + DFL_STEM: convergence_debug_small + DATA_CACHE_PATH: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache + -------------------- + ++++++++++++++++++++++++++++++++++++++++++++++++++ + - MPICH_DIR= + - Using /home/foremans/miniconda3/envs/q4-drop/bin/python3 + - WORLD_SIZE:24 + - NCCL: nccl + - MODEL_TYPE: llama-seq4096-pp1-tp1-32layers-32heads-4096hidden + - Using DATA_FILE_LIST: ./convergence_debug_small.txt + ++++++++++++++++++++++++++++++++++++++++++++++++++ + ! Using /home/foremans/miniconda3/envs/q4-drop/bin/deepspeed + /home/foremans/miniconda3/envs/q4-drop/bin/ds_report:4: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html + __import__('pkg_resources').require('deepspeed==0.12.3+6ea44d02') + /home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: ''If you dont plan on using image function + ality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torch + vision` from source? + warn( + [2024-04-04 09:07:45,585] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to xpu (auto detect) + [2024-04-04 09:07:45,818] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to xpu (auto detect) + -------------------------------------------------- + DeepSpeed C++/CUDA extension op report + -------------------------------------------------- + NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. 
Op compatibility means that your system + meet the required dependencies to JIT install the op. + -------------------------------------------------- + JIT compiled ops requires ninja + ninja .................. [OKAY] + -------------------------------------------------- + op name ................ installed .. compatible + -------------------------------------------------- + async_io ............... [NO] ....... [OKAY] + cpu_adagrad ............ [NO] ....... [OKAY] + cpu_adam ............... [NO] ....... [OKAY] + flash_attn ............. [NO] ....... [OKAY] + fused_adam ............. [NO] ....... [OKAY] + quantizer .............. [NO] ....... [OKAY] + transformer ............ [NO] ....... [OKAY] + transformer_inference .. [NO] ....... [OKAY] + utils .................. [NO] ....... [OKAY] + -------------------------------------------------- + DeepSpeed general environment info: + torch install path ............... ['/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torch'] + torch version .................... 2.1.0a0+cxx11.abi + deepspeed install path ........... ['/lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/DeepSpeed/deepspeed'] + deepspeed info ................... 0.12.3+6ea44d02, 6ea44d02, HEAD + deepspeed wheel compiled w. ...... torch 2.1 + shared memory (/dev/shm) size .... 
503.18 GB + + deepspeed --hostfile /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/hostfile_deepspeed --launcher MPICH /lus/gila/projects/Aurora_deployment/ + foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --optimizer adamw --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion --lr-decay + -style cosine --no-bias-dropout-fusion --no-masked-softmax-fusion --tokenizer-type Llama2Tokenizer --no-gradient-accumulation-fusion --accumulate-allreduce-grads-in-fp32 + --use-checkpoint-opt_param-scheduler --tensorboard-dir checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/tensorboard --log-timers-to-tensorboard --log-optimizer + -states-to-tensorboard --lr 0.0003 --save checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16 --load checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16 + --seq-length 4096 --num-layers 32 --hidden-size 4096 --train-iters 317892 --eval-iters 10 --distributed-backend ccl --num-attention-heads 32 --save-interval 20 + 0 --eval-interval 50000 --max-position-embeddings 4096 --micro-batch-size 4 --data-file-list ./convergence_debug_small.txt --tensor-model-parallel-size 1 --global-bat + ch-size 96 --pipeline-model-parallel-size 1 --num-key-value-heads 8 --data-cache-path /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ + .cache/convergence_debug_small/index-cache --ffn-hidden-size 11008 --tokenizer-model /home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ALCF/tokenizer.model --no-query- + key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --deepspeed-activation-checkpointing --z + ero-stage=2 --deepspeed_config=ds_stage2_mb4_gb96_pp1_bf16.json --no-pipeline-parallel --deepspeed --checkpoint-activations --checkpoint-num-layers 1 |& tee logs/ds_stage2 + 
_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0/output.log + + [!! NOTE] View output at: + logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0/output.log + + # ... + + /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document.bin + creating memory view of numpy buffer... + > finished creating indexed dataset in 0.010017 seconds + number of documents: 1498927 + > dataset split: + train: + document indices in [0, 1498927) total of 1498927 documents + validation: + document indices in [1498927, 1498927) total of 0 documents + test: + document indices in [1498927, 1498927) total of 0 documents + > loading doc-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_doc_idx.npy + > loading sample-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_sample_idx.npy + > loading shuffle-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_shuffle_idx.npy + loaded indexed file in 0.056 seconds + total number of samples: 2318461 + total number of epochs: 8 + > loading blendable dataset index: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/3a426af74008c22f9db24db811aad6b7_index.npy + > loading blendable dataset sample index: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/3a426af74008c22f9db24db811aad6b7_sample_index.npy + 
/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torch/utils/data/dataloader.py:557: UserWarning: This DataLoader will create 2 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary. + + [after dataloaders are built] datetime: 2024-04-04 09:09:27 + done with setup ... + (min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (64818.18, 64858.22) + train/valid/test-data-iterators-setup ..........: (1968.10, 2288.56) + training ... + [before the start of training step] datetime: 2024-04-04 09:09:27 + [2024-04-04 09:09:27,718] [INFO] [checkpointing.py:540:forward] Activation Checkpointing Information + [2024-04-04 09:09:27,719] [INFO] [checkpointing.py:541:forward] ----Partition Activations False, CPU CHECKPOINTING False + [2024-04-04 09:09:27,719] [INFO] [checkpointing.py:542:forward] ----contiguous Memory Checkpointing False with 32 total layers + [2024-04-04 09:09:27,719] [INFO] [checkpointing.py:544:forward] ----Synchronization False + [2024-04-04 09:09:27,719] [INFO] [checkpointing.py:545:forward] ----Profiling time in checkpointing False + [2024-04-04 09:09:33][INFO][utils:145] - Note: detected 208 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable. + [2024-04-04 09:09:33][INFO][utils:148] - Note: NumExpr detected 208 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. + [2024-04-04 09:09:33][INFO][utils:160] - NumExpr defaulting to 8 threads. 
+ ^[c[2024-04-04 09:09:53,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 884.11 | optimizer_gradients: 6.43 | optimizer_step: 23.44 + [2024-04-04 09:09:53,312] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[0.00029999999999267505, 0.00029999999999267505], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-04-04 09:09:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 6567.68 | bwd_microstep: 17950.36 | bwd_inner_microstep: 17711.20 | bwd_allreduce_microstep: 239.11 | step_microstep: 1139.27 + [2024-04-04 09:09:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 6567.66 | bwd: 17950.35 | bwd_inner: 17711.19 | bwd_allreduce: 239.11 | step: 1139.29 + [Rank 0] (after 1 iterations) memory (MB) | allocated: 18244.640625 | max allocated: 41299.50146484375 | reserved: 46764.0 | max reserved: 46764.0 + iteration 1/ 317892 | consumed samples: 96 | consumed tokens: 393216 | elapsed time per iteration (ms): 25849.1 | learning rate: 3.000E-04 | global batch size: 96 | lm loss: 1.117136E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 3.714 | tokens per gpu per second(tgs): 633.832 | TFLOPs: 38.61 | + [2024-04-04 09:10:13,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 327.85 | optimizer_gradients: 6.26 | optimizer_step: 23.60 + [2024-04-04 09:10:13,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[0.00029999999997070033, 0.00029999999997070033], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-04-04 09:10:13,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 4022.74 | bwd_microstep: 15738.67 | bwd_inner_microstep: 15556.80 | bwd_allreduce_microstep: 181.82 | step_microstep: 371.01 + [2024-04-04 09:10:13,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4022.73 | bwd: 15738.66 | bwd_inner: 15556.62 | bwd_allreduce: 181.81 | step: 371.02 + 
iteration 2/ 317892 | consumed samples: 192 | consumed tokens: 786432 | elapsed time per iteration (ms): 20298.3 | learning rate: 3.000E-04 | global batch size: 96 | lm loss: 2.537718E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 4.729 | tokens per gpu per second(tgs): 807.159 | TFLOPs: 49.17 | + ``` + +
+ +
[Polaris]: + + ```bash + # [09:31:35 AM][foremans@x3112c0s13b0n0][~/pol/p/a/Megatron-DeepSpeed][🌱 main][$!?] + $ PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt OPT=adamw bash train_llama_alcf.sh + source-ing /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/helpers.sh + Running on Polaris !! + + [python] Using: /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/python3 + Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env + Found ezpz! + /lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ezpz/src/ezpz/__init__.py + Has ezpz installed. Nothing to do. + Done with ezpz. + ┌─────────────────────────────────────────────────────────────────── + │ Writing PBS vars to /home/foremans/.pbsenv + │ HOSTFILE: /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov + │ NHOSTS: 2 + │ NGPU_PER_HOST: 4 GPUs per host + │ NGPUS: 8 GPUs total + └─────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Hosts]: + │ • [host:0] - x3112c0s13b0n0.hsn.cm.polaris.alcf.anl.gov + │ • [host:1] - x3112c0s13b1n0.hsn.cm.polaris.alcf.anl.gov + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [DIST INFO]: + │ • Loading job env from: /home/foremans/.pbsenv + │ • HOSTFILE: /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov + │ • NHOSTS: 2 + │ • NGPU_PER_HOST: 4 + │ • NGPUS (NHOSTS x NGPU_PER_HOST): 8 + │ • WORLD_SIZE: 8 + │ • DIST_LAUNCH: mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Launch]: + │ • Use: 'launch' (=mpiexec --verbose --envall -n 
8 -ppn 4 --hostfile /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov) + │ to launch job + └────────────────────────────────────────────────────────────────── + DS_CONFIG: ds_stage2_mb8_gb32_pp1_bf16.json + ZS: 2, CPU_OPTIMIZER: , MB: 8, GB: 32, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0 + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + Calling: setData() with "./convergence_debug_small.txt" + -------------------- + Updated environment: + DATA_FILE_LIST: ./convergence_debug_small.txt + NUM_DOCS: 15 + WEIGHT_SUM: 15.0 + DFL_STEM: convergence_debug_small + DATA_CACHE_PATH: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache + -------------------- + ++++++++++++++++++++++++++++++++++++++++++++++++++ + - MPICH_DIR=/opt/cray/pe/mpich/8.1.25/ofi/gnu/9.1 + - Using /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/python3 + - WORLD_SIZE:8 + - NCCL: nccl + - MODEL_TYPE: llama-seq4096-pp1-tp2-32layers-32heads-4096hidden + - Using DATA_FILE_LIST: ./convergence_debug_small.txt + ++++++++++++++++++++++++++++++++++++++++++++++++++ + ! Using /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/deepspeed + [2024-04-04 09:35:35,959] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda [auto detect] + -------------------------------------------------- + DeepSpeed C++/CUDA extension op report + -------------------------------------------------- + NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. Op compatibility means that your system + meet the required dependencies to JIT install the op. + -------------------------------------------------- + JIT compiled ops requires ninja + ninja .................. 
[OKAY] + -------------------------------------------------- + op name ................ installed .. compatible + -------------------------------------------------- + async_io ............... [NO] ....... [OKAY] + fused_adam ............. [NO] ....... [OKAY] + cpu_adam ............... [NO] ....... [OKAY] + cpu_adagrad ............ [NO] ....... [OKAY] + cpu_lion ............... [NO] ....... [OKAY] + [WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + evoformer_attn ......... [NO] ....... [NO] + fused_lamb ............. [NO] ....... [OKAY] + fused_lion ............. [NO] ....... [OKAY] + inference_core_ops ..... [NO] ....... [OKAY] + cutlass_ops ............ [NO] ....... [OKAY] + transformer_inference .. [NO] ....... [OKAY] + quantizer .............. [NO] ....... [OKAY] + ragged_device_ops ...... [NO] ....... [OKAY] + ragged_ops ............. [NO] ....... [OKAY] + random_ltd ............. [NO] ....... [OKAY] + [WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.2 + [WARNING] using untested triton version (2.2.0), only 1.0.0 is known to be compatible + sparse_attn ............ [NO] ....... [NO] + spatial_inference ...... [NO] ....... [OKAY] + transformer ............ [NO] ....... [OKAY] + stochastic_transformer . [NO] ....... [OKAY] + -------------------------------------------------- + DeepSpeed general environment info: + torch install path ............... ['/eagle/datascience/foremans/miniconda3/envs/cu118-pt221/lib/python3.12/site-packages/torch'] + torch version .................... 2.2.1 + deepspeed install path ........... ['/eagle/datascience/foremans/miniconda3/envs/cu118-pt221/lib/python3.12/site-packages/deepspeed'] + deepspeed info ................... 0.14.0, unknown, unknown + torch cuda version ............... 11.8 + torch hip version ................ None + nvcc version ..................... 11.8 + deepspeed wheel compiled w. ...... 
torch 2.2, cuda 11.8 + shared memory (/dev/shm) size .... 251.61 GB + + deepspeed --hostfile /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/hostfile_deepspeed --launcher MPICH /lus/eagle/projects/datascienc + e/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --optimizer adamw --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion + --lr-decay-style cosine --no-bias-dropout-fusion --no-masked-softmax-fusion --tokenizer-type Llama2Tokenizer --no-gradient-accumulation-fusion --accumulate-allreduce- + grads-in-fp32 --use-checkpoint-opt_param-scheduler --tensorboard-dir checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/tensorboard --log-timers-to-tensorboard - + -log-optimizer-states-to-tensorboard --lr 0.0003 --save checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16 --load checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_ + pp1_tp2_bf16 --seq-length 4096 --num-layers 32 --hidden-size 4096 --train-iters 317892 --eval-iters 10 --distributed-backend nccl --num-attention-heads 32 --s + ave-interval 200 --eval-interval 50000 --max-position-embeddings 4096 --micro-batch-size 8 --data-file-list ./convergence_debug_small.txt --tensor-model-parallel-size 2 + --global-batch-size 32 --pipeline-model-parallel-size 1 --num-key-value-heads 8 --data-cache-path /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-l + cf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache --ffn-hidden-size 11008 --tokenizer-model /home/foremans/polaris/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/tokeniz + er.model --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --use-flash-attn-v2 + --deepspeed-activation-checkpointing --zero-stage=2 --deepspeed_config=ds_stage2_mb8_gb32_pp1_bf16.json --no-pipeline-parallel --deepspeed 
--checkpoint-activations --checkpoint- + num-layers 1 |& tee logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0/output.log + + [!! NOTE] View output at: + logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0/output.log + + # ... + + /eagle/datasets/dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document.bin + creating memory view of numpy buffer... + > finished creating indexed dataset in 0.001280 seconds + number of documents: 1498927 + > dataset split: + train: + document indices in [0, 1498927) total of 1498927 documents + validation: + document indices in [1498927, 1498927) total of 0 documents + test: + document indices in [1498927, 1498927) total of 0 documents + > loading doc-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_doc_idx.npy + > loading sample-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_sample_idx.npy + > loading shuffle-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_shuffle_idx.npy + loaded indexed file in 0.004 seconds + total number of samples: 869423 + total number of epochs: 3 + > loading blendable dataset index: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/a815d51f6752c6f486d94194ce95fb87_index.npy + > loading blendable dataset sample index: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/a815d51f6752c6f486d94194ce95fb87_sample_index.npy 
+ > size of blendable dataset: 10223415 samples + > finished creating GPT datasets ... + [after dataloaders are built] datetime: 2024-04-04 09:36:07 + done with setup ... + (min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (4794.78, 4795.23) + train/valid/test-data-iterators-setup ..........: (589.69, 721.20) + training ... + [before the start of training step] datetime: 2024-04-04 09:36:07 + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:539:forward] Activation Checkpointing Information + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:540:forward] ----Partition Activations False, CPU CHECKPOINTING False + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:541:forward] ----contiguous Memory Checkpointing False with 32 total layers + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:543:forward] ----Synchronization False + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:544:forward] ----Profiling time in checkpointing False + [2024-04-04 09:36:28,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1626.54 | optimizer_gradients: 19.29 | optimizer_step: 419.48 + [2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[0.00029999999999267505, 0.00029999999999267505], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 11336.34 | bwd_microstep: 7134.73 | bwd_inner_microstep: 7090.02 | bwd_allreduce_microstep: 44.65 | step_microstep: 2564.02 + [2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 11336.33 | bwd: 7134.75 | bwd_inner: 7090.01 | bwd_allreduce: 44.66 | step: 2564.02 + iteration 1/ 317892 | consumed samples: 32 | consumed tokens: 131072 | elapsed time per iteration (ms): 21133.8 | learning rate: 3.000E-04 | global batch size: 32 | lm loss: 1.119983E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan 
iterations: 0 | samples per second: 1.514 | tokens per gpu per second(tgs): 775.250 | TFLOPs: 47.23 | + [Rank 1] (after 1 iterations) memory (MB) | allocated: 14165.525390625 | max allocated: 22332.37255859375 | reserved: 24642.0 | max reserved: 35824.0 + [Rank 0] (after 1 iterations) memory (MB) | allocated: 14165.525390625 | max allocated: 22332.37255859375 | reserved: 24642.0 | max reserved: 32994.0 + [2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1605.55 | optimizer_gradients: 11.56 | optimizer_step: 50.92 + [2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[0.00029999999997070033, 0.00029999999997070033], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1395.17 | bwd_microstep: 6832.48 | bwd_inner_microstep: 6789.73 | bwd_allreduce_microstep: 42.70 | step_microstep: 1867.64 + [2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 1395.15 | bwd: 6832.49 | bwd_inner: 6789.73 | bwd_allreduce: 42.71 | step: 1867.65 + iteration 2/ 317892 | consumed samples: 64 | consumed tokens: 262144 | elapsed time per iteration (ms): 10154.3 | learning rate: 3.000E-04 | global batch size: 32 | lm loss: 1.766422E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 3.151 | tokens per gpu per second(tgs): 1613.503 | TFLOPs: 98.29 | + + # ... + ``` + +
+ +
+
+ + + + + + + + + + +### 🚀 Submit as a batch job + +```bash +$ cd Megatron-DeepSpeed +$ qsub -A <your-project> -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home train_llama_alcf.sh +``` + + + +## 📝 Data Preprocessing + +
Data Pre-Processing: + +AuroraGPT is trained on the Dolma dataset (initially v0; we are now in the process of moving to v6). For more details on the dataset, refer to https://huggingface.co/datasets/allenai/dolma. The downloaded Dolma dataset has already been preprocessed to remove duplicates (dedup) and to filter the data (mixing). For more details, refer to https://github.com/allenai/dolma/tree/main/docs and https://github.com/vksastry/dolma_alcf/blob/main/ALCF/Readme.md. + +Before training, the Dolma dataset is preprocessed by tokenizing the data with a specific tokenizer (we currently use LlamaTokenizer). Use the script below to tokenize the entire dataset. The example shown is for Polaris. + +``` bash +cd /eagle/datasets/dolma/utils +./tokenization.sh +``` + +
+ +## ✅ TODOs + +
+TODOs: + +- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` + - [ ] specifically, `momentum, beta{1, 2}, etc` + +
Completed + +- Continue runs on Polaris @ + - [x] 48 Nodes + - [x] 32 Nodes + - [x] 16 Nodes + - [x] 8 Nodes + - [x] 4 Nodes + +- [x] Then, try re-creating ( / fixing) conda with `cuda==12.1` + - 😔, failed. + +- ~~‼️ Unable to save checkpoints with `torch==2.1` + `cuda==11.8`~~: + - Fixed in [a57a21f](https://github.com/argonne-lcf/Megatron-DeepSpeed/commit/a57a21f6b2a8abf847f5ef599e1b1edcb5a5e1b5) + +
🐛 Bug + + - Training progresses OK: + + ```bash + [2024-03-07 15:27:02,646] [INFO] [timer.py:260:stop] epoch=0/micro_step=199/global_step=199, RunningAvgSamplesPerSec=58.730622229657506, CurrSamplesPerSec=61.35304005128382, MemAllocated=6.01GB, MaxMemAllocated=19.52GB + iteration 199/ 317892 | consumed samples: 152832 | consumed tokens: 625999872 | elapsed time per iteration (ms): 14287.5 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.905366E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 53.753 | tokens per gpu per second (tgs): 1146.733 | TFLOPs: 69.85 | + [2024-03-07 15:27:15,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=4, lr=[0.000240653265864008, 0.000240653265864008], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-03-07 15:27:17,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=58.730745476291396, CurrSamplesPerSec=58.75503515561452, MemAllocated=6.01GB, MaxMemAllocated=19.52GB + iteration 200/ 317892 | consumed samples: 153600 | consumed tokens: 629145600 | elapsed time per iteration (ms): 14541.4 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.897035E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 52.815 | tokens per gpu per second (tgs): 1126.713 | TFLOPs: 68.63 | + saving checkpoint at iteration 200 to checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb768_pp1_tp2_fp16 + # ... 
+ ``` + + - Then crashes with: + + ```python + Traceback (most recent call last): + Traceback (most recent call last): + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 575, in + model = main() + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 554, in main + model = pretrain( + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 226, in pretrain + iteration = train(forward_step_func, + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1290, in train + save_checkpoint_and_time(iteration, model, optimizer, + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1151, in save_checkpoint_and_time + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 259, in save_checkpoint + state_dict[UNIVERSAL_CHECKPOINT_INFO] = _universal_checkpoint_info(model) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 783, in _universal_checkpoint_info + info.update(model[0].universal_checkpoint_info()) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 203, in universal_checkpoint_info + info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() + File "/lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__ + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + AttributeError: 'GPTModel' object has no attribute '_get_tp_replicated_param_patterns' + ``` + + 🤔 +
+ +
+ +
+ +
+ +
diff --git a/ALCF/aws_ofi_nccl_plugin.sh b/ALCF/aws_ofi_nccl_plugin.sh new file mode 100644 index 0000000000..ffd1471cd3 --- /dev/null +++ b/ALCF/aws_ofi_nccl_plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash --login + +# AWS NCCL OFI Plugin settings below +export NCCL_CROSS_NIC=1 +export NCCL_COLLNET_ENABLE=1 +export NCCL_NET="AWS Libfabric" +export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH +export FI_CXI_DISABLE_HOST_REGISTER=1 +export FI_MR_CACHE_MONITOR=userfaultfd +export FI_CXI_DEFAULT_CQ_SIZE=131072 +######################################################### +# WARNING: !!! +# - Currently, `export NCCL_NET_GDR_LEVEL=PHB` +# causes a hang on Polaris. +# so, we don't set it for the time being [2024-05-14]. +# - Seems to work on Perlmutter ??? +# +# export NCCL_NET_GDR_LEVEL=PHB +######################################################### diff --git a/ALCF/data-lists/aurora/algebraic.txt b/ALCF/data-lists/aurora/algebraic.txt new file mode 100644 index 0000000000..d3685cb42b --- /dev/null +++ b/ALCF/data-lists/aurora/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train diff --git a/ALCF/data-lists/aurora/arxiv.txt b/ALCF/data-lists/aurora/arxiv.txt new file mode 100644 index 0000000000..c18c2befd2 --- /dev/null +++ b/ALCF/data-lists/aurora/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv diff --git a/ALCF/data-lists/aurora/books.txt b/ALCF/data-lists/aurora/books.txt new file mode 100644 index 0000000000..6f37023596 --- /dev/null +++ b/ALCF/data-lists/aurora/books.txt @@ -0,0 +1,3 @@ +0.0031025147279277244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books diff --git a/ALCF/data-lists/aurora/c4.txt b/ALCF/data-lists/aurora/c4.txt new file mode 100644 index 0000000000..7ad92c6086 --- 
/dev/null +++ b/ALCF/data-lists/aurora/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 diff --git a/ALCF/data-lists/aurora/cc.txt b/ALCF/data-lists/aurora/cc.txt new file mode 100644 index 0000000000..174bae9d6a --- /dev/null +++ b/ALCF/data-lists/aurora/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc 
+0.00041517835659357416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc 
+0.0004299383567984396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc 
+0.00040072346558408346 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc 
+0.000340490625886403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc 
+0.00031046608876279845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc 
+0.0003697177501953134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc 
+0.0003377914127574796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc 
+0.0003572451925404141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc 
+0.00025734961751176414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc 
+0.0003125788639953052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc 
+0.00020660449162246918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc 
+0.0002769326396439027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc 
+0.00030741666395911753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc + diff --git a/ALCF/data-lists/aurora/dolma.txt b/ALCF/data-lists/aurora/dolma.txt new file mode 100644 index 0000000000..4aba801e00 --- /dev/null +++ b/ALCF/data-lists/aurora/dolma.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train +0.0002583902668716813 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv +0.0031025147279277244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books +0.0002406272620255565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 +0.0003742481815405742 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc 
+0.00046358333286115075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc 
+0.0003859912398894952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc 
+0.0004239190342959713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc 
+0.0003452640607156025 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc 
+0.0003045824546400205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc 
+0.00035915267825103423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc 
+0.0002976903331149755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc 
+0.000349004719809412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc 
+0.00025617700500574084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc 
+0.00030966560718296085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc 
+0.0004052437256652738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc 
+0.00026068042672138127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc 
+0.000293397351001072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc +0.0003547982093445404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon +6.322825248625475e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika 
+5.36773610843632e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika +0.001451215788905126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train +0.0012499632072059553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o +0.0005759963691850877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit +0.0009994361338078242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange +0.004474659408857016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder +0.00032927705604725614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu 
+0.0002839461156551537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document 
tulu +0.00029281450669483455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu 
+0.00031106811228913666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu +0.003548077173506675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/data-lists/aurora/dolma_v1_7_file_list.txt b/ALCF/data-lists/aurora/dolma_v1_7_file_list.txt new file mode 100644 index 0000000000..2cc52b55f6 --- /dev/null +++ b/ALCF/data-lists/aurora/dolma_v1_7_file_list.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document 
+0.00031628781276576474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document 
+0.00030399523537629673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document 
+0.00027033888377464685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document 
+0.00030381336664000807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document 
+0.00023325653928600864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document 
+0.00026778444816570715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document 
+0.0009996745994661548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document 
+0.000350277765402227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document 
+0.00035611750387841917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document 
+0.000352400524650424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document 
+0.00035208814419277406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document 
+0.0003534082129597368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document 
+0.00036070626989861965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document 
+0.00035919595619778716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document 
+0.000360752554360921 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document 
+0.0003530734609369085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document 
+0.0003624575124332874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document 
+0.00035327984042622104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document 
+0.0003460966026953241 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document 
+0.0003541536557159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document 
+0.00035255554724471217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document 
+0.0003509421562481707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document 
+0.0003535385113938023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document 
+0.0008910853619157279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document +0.003548077173506675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document diff --git a/ALCF/data-lists/aurora/falcon.txt b/ALCF/data-lists/aurora/falcon.txt new file mode 100644 index 0000000000..997038fb38 --- /dev/null +++ b/ALCF/data-lists/aurora/falcon.txt @@ -0,0 +1,501 @@ +0.0003547982093445404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon + diff --git a/ALCF/data-lists/aurora/megawiki.txt b/ALCF/data-lists/aurora/megawiki.txt new file mode 100644 index 0000000000..635eba3d90 --- /dev/null +++ b/ALCF/data-lists/aurora/megawiki.txt @@ -0,0 +1,262 @@ +6.322825248625475e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika diff --git a/ALCF/data-lists/aurora/open-web-math-train.txt b/ALCF/data-lists/aurora/open-web-math-train.txt new file mode 100644 index 0000000000..e0dfc30bd7 --- /dev/null +++ b/ALCF/data-lists/aurora/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train diff --git a/ALCF/data-lists/aurora/pes2o.txt b/ALCF/data-lists/aurora/pes2o.txt new file mode 100644 index 0000000000..3d0cdbe479 --- /dev/null +++ b/ALCF/data-lists/aurora/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o 
+0.0012527268234360602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o diff --git a/ALCF/data-lists/aurora/reddit.txt b/ALCF/data-lists/aurora/reddit.txt new file mode 100644 index 0000000000..ebc1e15ada --- /dev/null +++ b/ALCF/data-lists/aurora/reddit.txt @@ -0,0 +1,78 @@ +0.0005759963691850877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit diff --git a/ALCF/data-lists/aurora/stack.txt b/ALCF/data-lists/aurora/stack.txt new file mode 100644 index 0000000000..d99516f5fb --- /dev/null +++ b/ALCF/data-lists/aurora/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange diff --git a/ALCF/data-lists/aurora/starcoder.txt b/ALCF/data-lists/aurora/starcoder.txt new file mode 100644 index 
0000000000..2a5be0cf72 --- /dev/null +++ b/ALCF/data-lists/aurora/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder + diff --git a/ALCF/data-lists/aurora/tulu.txt b/ALCF/data-lists/aurora/tulu.txt new file mode 100644 index 0000000000..46b3a91a40 --- /dev/null +++ b/ALCF/data-lists/aurora/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu 
+0.0002861705604659811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document 
tulu +0.0002915338187002653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document 
tulu +0.00021621682877018768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu diff --git a/ALCF/data-lists/aurora/wiki.txt b/ALCF/data-lists/aurora/wiki.txt new file mode 100644 index 0000000000..c70a54f598 --- /dev/null +++ b/ALCF/data-lists/aurora/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/data-lists/polaris/algebraic.txt b/ALCF/data-lists/polaris/algebraic.txt new file mode 100644 index 0000000000..394649fcf4 --- /dev/null +++ b/ALCF/data-lists/polaris/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train 
+0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train diff --git a/ALCF/data-lists/polaris/arxiv.txt b/ALCF/data-lists/polaris/arxiv.txt new file mode 100644 index 0000000000..85e59adacd --- /dev/null +++ b/ALCF/data-lists/polaris/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv 
+0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv diff --git a/ALCF/data-lists/polaris/books.txt b/ALCF/data-lists/polaris/books.txt new file mode 100644 index 0000000000..c222c32c07 --- /dev/null +++ b/ALCF/data-lists/polaris/books.txt @@ -0,0 +1,3 @@ +0.0031025147279277244 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books diff --git a/ALCF/data-lists/polaris/c4.txt b/ALCF/data-lists/polaris/c4.txt new file mode 100644 index 0000000000..512556eafb --- /dev/null +++ b/ALCF/data-lists/polaris/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 diff --git a/ALCF/data-lists/polaris/cc.txt b/ALCF/data-lists/polaris/cc.txt new file mode 100644 index 0000000000..75485866e6 --- /dev/null +++ b/ALCF/data-lists/polaris/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc 
+0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc 
+0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc 
+0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc 
+0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc 
+0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc 
+0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc 
+0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc 
+0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc 
+0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc 
+0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc 
+0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc 
+0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc 
+0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc 
+0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc 
+0.00020475155724026002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc 
+0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc 
+0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc 
+0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc 
+0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc 
+0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc 
+0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc 
+0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc 
+0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc 
+0.00020300093701169531 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc 
+0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc 
+0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc 
+0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc 
+0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc 
+0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc 
+0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc 
+0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc 
+0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc 
+0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc 
+0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc + diff --git a/ALCF/data-lists/polaris/dolma.txt b/ALCF/data-lists/polaris/dolma.txt new file mode 100644 index 0000000000..f2f98ab12c --- /dev/null +++ b/ALCF/data-lists/polaris/dolma.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train 
+0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv +0.0031025147279277244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc 
+0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc 
+0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc 
+0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc 
+0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc 
+0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc 
+0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc 
+0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc 
+0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc 
+0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc 
+0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc 
+0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc 
+0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc 
+0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc 
+0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc 
+0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc 
+0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc 
+0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc 
+0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc 
+0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc 
+0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc 
+0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc 
+0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc 
+0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc 
+0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc 
+0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc 
+0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc 
+0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc 
+0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc 
+0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc 
+0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc 
+0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc 
+0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc 
+0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc 
+0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc +0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document 
falcon +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon 
+0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon 
+0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon 
+0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon 
+0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon 
+0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon 
+0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon 
+0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon 
+0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon 
+0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon 
+0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon 
+0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon 
+0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon 
+0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon 
+0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon 
+0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon 
+0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon 
+0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon 
+0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon 
+0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon 
+0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon 
+0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon 
+0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon 
+0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon 
+0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon 
+0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika 
+2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train 
+0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document 
stackexchange +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder 
+0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu 
+0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document 
tulu +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt b/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt new file mode 100644 index 0000000000..6f34558ec3 --- /dev/null +++ b/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document 
+0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document 
+0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document 
+0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document 
+0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document 
+0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document 
+0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document diff --git a/ALCF/data-lists/polaris/falcon.txt b/ALCF/data-lists/polaris/falcon.txt new file mode 100644 index 0000000000..914d4803a4 --- /dev/null +++ b/ALCF/data-lists/polaris/falcon.txt @@ -0,0 +1,501 @@ +0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon + diff --git a/ALCF/data-lists/polaris/megawiki.txt b/ALCF/data-lists/polaris/megawiki.txt new file mode 100644 index 0000000000..56ec7debc7 --- /dev/null +++ b/ALCF/data-lists/polaris/megawiki.txt @@ -0,0 +1,262 @@ +6.322825248625475e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika diff --git a/ALCF/data-lists/polaris/open-web-math-train.txt b/ALCF/data-lists/polaris/open-web-math-train.txt new file mode 100644 index 0000000000..6d86bd35a4 --- /dev/null +++ b/ALCF/data-lists/polaris/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train diff --git a/ALCF/data-lists/polaris/pes2o.txt b/ALCF/data-lists/polaris/pes2o.txt new file mode 100644 index 0000000000..47a7eb3ffd --- /dev/null +++ b/ALCF/data-lists/polaris/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o diff --git a/ALCF/data-lists/polaris/reddit.txt b/ALCF/data-lists/polaris/reddit.txt new file 
mode 100644 index 0000000000..ef79bbc7c8 --- /dev/null +++ b/ALCF/data-lists/polaris/reddit.txt @@ -0,0 +1,78 @@ +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit 
+0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit diff --git a/ALCF/data-lists/polaris/stack.txt b/ALCF/data-lists/polaris/stack.txt new file mode 100644 index 0000000000..a81e55f94a --- /dev/null +++ b/ALCF/data-lists/polaris/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange diff --git a/ALCF/data-lists/polaris/starcoder.txt b/ALCF/data-lists/polaris/starcoder.txt new file mode 100644 index 0000000000..5c28dd55b6 --- /dev/null +++ b/ALCF/data-lists/polaris/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder + diff --git a/ALCF/data-lists/polaris/tulu.txt b/ALCF/data-lists/polaris/tulu.txt new file mode 100644 index 0000000000..e7a681d660 --- /dev/null +++ b/ALCF/data-lists/polaris/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu 
+0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document 
tulu +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu diff --git a/ALCF/data-lists/polaris/wiki.txt b/ALCF/data-lists/polaris/wiki.txt new file mode 100644 index 0000000000..55ba7680ad --- /dev/null +++ b/ALCF/data-lists/polaris/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/data-lists/sirius/books.txt b/ALCF/data-lists/sirius/books.txt new file mode 100644 index 0000000000..7567ba5227 --- /dev/null +++ b/ALCF/data-lists/sirius/books.txt @@ -0,0 +1,3 @@ +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0000_text_document +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0001_text_document +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0002_text_document diff --git a/ALCF/data-lists/sunspot/algebraic.txt b/ALCF/data-lists/sunspot/algebraic.txt new file mode 100644 index 0000000000..f72bf47d74 --- /dev/null +++ b/ALCF/data-lists/sunspot/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train diff --git a/ALCF/data-lists/sunspot/arxiv.txt b/ALCF/data-lists/sunspot/arxiv.txt new file mode 100644 index 0000000000..34972accf4 --- /dev/null +++ b/ALCF/data-lists/sunspot/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv diff --git a/ALCF/data-lists/sunspot/books.txt 
b/ALCF/data-lists/sunspot/books.txt new file mode 100644 index 0000000000..9502fba1f5 --- /dev/null +++ b/ALCF/data-lists/sunspot/books.txt @@ -0,0 +1,3 @@ +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books diff --git a/ALCF/data-lists/sunspot/c4.txt b/ALCF/data-lists/sunspot/c4.txt new file mode 100644 index 0000000000..ca4836ad81 --- /dev/null +++ b/ALCF/data-lists/sunspot/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 diff --git a/ALCF/data-lists/sunspot/cc.txt b/ALCF/data-lists/sunspot/cc.txt new file mode 100644 index 0000000000..d771efb06a --- /dev/null +++ b/ALCF/data-lists/sunspot/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc 
+0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document 
cc +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc 
+0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document 
cc +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc 
+0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc 
+0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc 
+0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document 
cc +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document 
cc +0.0002643604938780134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc 
+0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc 
+0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document 
cc +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc 
+0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document 
cc +0.00020592870907067763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc 
+0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document 
cc +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document 
cc +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc 
+0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc + diff --git a/ALCF/data-lists/sunspot/dolma.txt b/ALCF/data-lists/sunspot/dolma.txt new file mode 100644 index 0000000000..7015d885c1 --- /dev/null +++ b/ALCF/data-lists/sunspot/dolma.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document 
algebraic-stack-train +0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train +0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books +0.0002406272620255565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 
+0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 
+0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 
+0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 
+0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 
+0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 
+0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 
+0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 
+0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 
+0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 
+0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 
+0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc 
+0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document 
cc +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc 
+0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document 
cc +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc 
+0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc 
+0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc 
+0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document 
cc +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document 
cc +0.0002643604938780134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc 
+0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc 
+0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document 
cc +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc 
+0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document 
cc +0.00020592870907067763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc 
+0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document 
cc +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document 
cc +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc 
+0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document 
megawika +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document 
megawika +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document 
megawika +5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document 
megawika +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document 
megawika +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o 
+0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document 
stackexchange +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document 
starcoder +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu 
+0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu 
+0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu 
+0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu 
+0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu +0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt b/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt new file mode 100644 index 0000000000..5d142522a7 --- /dev/null +++ b/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document 
+1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document 
+1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document 
+6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document 
+5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document 
+8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document 
+2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document 
+0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document 
+0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document 
+0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document +0.003548077173506675 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document diff --git a/ALCF/data-lists/sunspot/falcon.txt b/ALCF/data-lists/sunspot/falcon.txt new file mode 100644 index 0000000000..e5afb89283 --- /dev/null +++ b/ALCF/data-lists/sunspot/falcon.txt @@ -0,0 +1,501 @@ +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document 
falcon +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon + diff --git a/ALCF/data-lists/sunspot/megawiki.txt b/ALCF/data-lists/sunspot/megawiki.txt new file mode 100644 index 0000000000..f7fbabc913 --- /dev/null +++ b/ALCF/data-lists/sunspot/megawiki.txt @@ -0,0 +1,262 @@ +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document 
megawika +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document 
megawika +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document 
megawika +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document 
megawika +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document 
megawika +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika diff --git a/ALCF/data-lists/sunspot/open-web-math-train.txt b/ALCF/data-lists/sunspot/open-web-math-train.txt new file mode 
100644 index 0000000000..ffa745cd76 --- /dev/null +++ b/ALCF/data-lists/sunspot/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train 
+0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train diff --git a/ALCF/data-lists/sunspot/pes2o.txt b/ALCF/data-lists/sunspot/pes2o.txt new file mode 100644 index 0000000000..a2a1209d8c --- /dev/null +++ b/ALCF/data-lists/sunspot/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o diff --git a/ALCF/data-lists/sunspot/reddit.txt b/ALCF/data-lists/sunspot/reddit.txt new file mode 100644 index 0000000000..a1de492a2f --- /dev/null +++ b/ALCF/data-lists/sunspot/reddit.txt @@ -0,0 +1,78 @@ 
+0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit diff --git a/ALCF/data-lists/sunspot/stack.txt b/ALCF/data-lists/sunspot/stack.txt new file mode 100644 index 0000000000..60cf4451ab --- /dev/null +++ b/ALCF/data-lists/sunspot/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document 
stackexchange +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange diff --git a/ALCF/data-lists/sunspot/starcoder.txt b/ALCF/data-lists/sunspot/starcoder.txt new file mode 100644 index 0000000000..0011e33989 --- /dev/null +++ b/ALCF/data-lists/sunspot/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder + diff --git a/ALCF/data-lists/sunspot/tulu.txt b/ALCF/data-lists/sunspot/tulu.txt new file mode 100644 index 0000000000..b2e1425784 --- /dev/null +++ b/ALCF/data-lists/sunspot/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu diff --git a/ALCF/data-lists/sunspot/wiki.txt b/ALCF/data-lists/sunspot/wiki.txt new file mode 100644 index 0000000000..172bd0323b --- /dev/null +++ b/ALCF/data-lists/sunspot/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/ds_to_universal.py b/ALCF/ds_to_universal.py new file mode 100755 index 0000000000..da7fcb708e --- /dev/null +++ b/ALCF/ds_to_universal.py @@ -0,0 +1,693 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from functools import partial +from itertools import chain +import argparse +import glob +import itertools +import math +from concurrent.futures import ProcessPoolExecutor +import os +import re +import shutil +import torch +import tqdm +# from pprint import pprint + +from deepspeed.checkpoint import DeepSpeedCheckpoint +from deepspeed.checkpoint import ( + OPTIMIZER_STATE_DICT, + ZERO_STAGE, + BASE_OPTIMIZER_STATE, + SINGLE_PARTITION_OF_FP32_GROUPS, + PARAM_GROUPS, + PARAM_SLICE_MAPPINGS, + PARAM_SHAPES, + PARAM, + CAT_DIM, + PARAM_N_SUB_PARAMS, + SUB_PARAM_SHAPE, + VOCAB_TENSOR, + UNIVERSAL_CHECKPOINT_INFO, + UNIVERSAL_CHECKPOINT_VERSION_KEY, + UNIVERSAL_CHECKPOINT_VERSION_VALUE, + VOCABULARY_PARAMETER_PATTERNS, + PIPELINE_REPLICATED_PARAMETER_PATTERNS, + TP_REPLICATED_PARAMETER_PATTERNS, + PARAMETER_TO_AVERAGE_PATTERNS, + PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, + PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, + PARAMETER_WITH_SUB_PARAMS, + SubparamShape, +) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_folder", + type=str, + required=True, + help="Input DeepSpeed Checkpoint folder", + ) + parser.add_argument( + "--output_folder", + type=str, + required=False, + default=None, + help="Output DeepSpeed checkpoint folder", + ) + parser.add_argument( + "--num_extract_workers", + default=4, + type=int, + help="How many parallel processes to extract zero shards", + ) + parser.add_argument( + "--num_merge_workers", + default=2, + type=int, + help="How many parallel processes to merge tp slices (more memory intensive, use much fewer than --num_extract_workers))", + ) + parser.add_argument( + "--keep_temp_folder", + action="store_true", + help="Preserve temporary folder of intermediate checkpoint slice files. 
Useful for debugging.", + ) + parser.add_argument( + "--no_strict", + dest="strict", + action="store_false", + help="Do not perform validity checks on converted checkpoint.", + ) + parser.add_argument( + "--inject_missing_state", + action="store_true", + help="Inject missing checkpoint state into the checkpoint if it is absent.", + ) + args = parser.parse_args() + print(f"args = {args}") + return args + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + """ + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + """ + return [atoi(c) for c in re.split(r"(\d+)", text)] + + +def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): + path_list = [] + iter_folder = f"iter_{iteration:07d}" + for i in range(0, tp_degree): + path_list.append([]) + for j in range(0, pp_degree): + rank_folder = ( + f"mp_rank_{i:02d}" if pp_degree == 1 else f"mp_rank_{i:02d}_{j:03d}" + ) + ckpt_path = os.path.join(rank_folder, "model_optim_rng.pt") + path_list[i].append(os.path.join(base_folder, iter_folder, ckpt_path)) + + return path_list + + +def _save_checkpoint(file_path, chkpt_sd): + dir, _ = os.path.split(file_path) + os.makedirs(dir, exist_ok=True) + torch.save(chkpt_sd, file_path) + + +def extract_zero_shards(dir, ds_checkpoint, indices_3D): + pp_index, tp_index, dp_index = indices_3D + sd = ds_checkpoint.get_zero_checkpoint_state( + pp_index=pp_index, + tp_index=tp_index, + dp_index=dp_index, + strip_tensor_paddings=False, + ) + + # pprint(f"Processing {dp_index=} {pp_index=}, {tp_index=}") + + optim_sd = sd[OPTIMIZER_STATE_DICT] + param_slice_mappings = optim_sd[PARAM_SLICE_MAPPINGS] + universal_checkpoint_info = ds_checkpoint.get_checkpoint_info( + UNIVERSAL_CHECKPOINT_INFO + ) + pipeline_replicated_params = universal_checkpoint_info.get( + PIPELINE_REPLICATED_PARAMETER_PATTERNS, [] + ) + # 
print(f'{pipeline_replicated_params=}') + + # dict + state_groups = optim_sd[BASE_OPTIMIZER_STATE]["state"] + # list + fp32_groups = optim_sd[SINGLE_PARTITION_OF_FP32_GROUPS] + param_groups_cnt = len(state_groups) + + for param_group_id in range(param_groups_cnt): + flat_state = dict( + exp_avg=state_groups[param_group_id]["exp_avg"], + exp_avg_sq=state_groups[param_group_id]["exp_avg_sq"], + fp32=fp32_groups[param_group_id], + ) + + if "step" in state_groups[param_group_id]: + flat_state["step"] = state_groups[param_group_id]["step"] + + for name, fragment_mapping in param_slice_mappings[param_group_id].items(): + if pp_index > 0 and any( + re.match(pattern, name) for pattern in pipeline_replicated_params + ): + # Skip tied weights that are replicated in first and last pp stages + continue + + # pprint(f"dpt{dp_index}{pp_index}{tp_index} {param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}") + for state_key in flat_state.keys(): + dump_param_fragment( + dir, + tp_index, + dp_index, + state_key, + flat_state[state_key], + name, + fragment_mapping.start, + fragment_mapping.numel, + ) + + +def extract_zero_shards_stage3( + optim_files, param_shapes, dp_degree, temp_dir, dp_index +): + state_dict = torch.load(optim_files[dp_index], map_location="cpu") + + flat_state = dict( + exp_avg=state_dict[OPTIMIZER_STATE_DICT]["optimizer_state_dict"]["state"][0][ + "exp_avg" + ], + exp_avg_sq=state_dict[OPTIMIZER_STATE_DICT]["optimizer_state_dict"]["state"][0][ + "exp_avg_sq" + ], + fp32=state_dict[OPTIMIZER_STATE_DICT]["fp32_flat_groups"][0], + ) + + offset = 0 + for name, shape in param_shapes.items(): + unpartitioned_numel = shape.numel() + partitioned_numel, _ = _zero_partitioned_param_info( + unpartitioned_numel, dp_degree + ) + padding_free_numel = min( + partitioned_numel, abs(unpartitioned_numel - dp_index * partitioned_numel) + ) + for state_key in flat_state.keys(): + dump_param_fragment( + temp_dir, + 0, + dp_index, + state_key, + 
flat_state[state_key], + name, + offset, + padding_free_numel, + ) + offset += partitioned_numel + + +cnt = 0 + + +def dp_index_to_str(dp_index): + return f"{dp_index:0>2d}" + + +def dump_param_fragment( + dir, tp_index, dp_index, state_name, state_flat_tensor, param_name, offset, numel +): + global cnt # temp hack + + param_base_path = os.path.join(dir, param_name, str(tp_index)) + os.makedirs(param_base_path, exist_ok=True) + + cnt += 1 + + path = os.path.join(param_base_path, f"{state_name}.{dp_index_to_str(dp_index)}") + + # print(f"{param_name}: {offset}: {numel} => {path}") + + # State might be a python int or a tensor + if state_name != "step" and torch.is_tensor(state_flat_tensor): + state_flat_tensor = state_flat_tensor.narrow(0, offset, numel).clone() + _save_checkpoint(path, state_flat_tensor) + + +def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape=None): + slices = [] + for tp_index in range(tp_degree): + prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") + paths = glob.glob(f"{prefix_path}.*") + + if len(paths) == 0: + continue + + pattern = re.compile(f"{prefix_path}\\.([0-9]+)") + dp_indices = set() + for p in paths: + m = pattern.match(p) + if m: + dp_indices.add(int(m.group(1))) + else: + raise ValueError(f"Cannot parse dp_rank from {p}") + + paths = [ + f"{prefix_path}.{dp_index_to_str(dp_index)}" + for dp_index in sorted(list(dp_indices)) + ] + shards = [torch.load(p) for p in paths] + + if state == "step": + assert all( + v == shards[0] for v in shards + ), "All shards must have the same step value" + slice = shards[0] + else: + if slice_shape is None: + slice = torch.cat(shards, dim=0) + else: + slice = torch.cat(shards, dim=0).reshape(slice_shape) + + slices.append(slice) + return slices + + +def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape): + name, shape = name_and_shape + slice_base_path = os.path.join(slice_dir, name) + param_base_path = os.path.join(dir, name) + + 
universal_checkpoint_info = ds_checkpoint.get_checkpoint_info( + UNIVERSAL_CHECKPOINT_INFO + ) + replicated_parameters = universal_checkpoint_info.get( + TP_REPLICATED_PARAMETER_PATTERNS, [] + ) + parameters_to_average = universal_checkpoint_info.get( + PARAMETER_TO_AVERAGE_PATTERNS, [] + ) + parameters_with_row_parallelism = universal_checkpoint_info.get( + PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, [] + ) + vocabulary_parameters = universal_checkpoint_info.get( + VOCABULARY_PARAMETER_PATTERNS, [] + ) + parameters_with_2_sub_params_cat_dim_0 = universal_checkpoint_info.get( + PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, [] + ) + parameter_with_sub_params = universal_checkpoint_info.get( + PARAMETER_WITH_SUB_PARAMS, [] + ) + + unmatched_patterns = set( + replicated_parameters + + parameters_to_average + + parameters_with_row_parallelism + + vocabulary_parameters + + parameters_with_2_sub_params_cat_dim_0 + ) + unmatched_patterns.update( + chain.from_iterable( + SubparamShape(**s).patterns for s in parameter_with_sub_params + ) + ) + + def get_matched_pattern(patterns_, name_): + matched_ = [pattern_ for pattern_ in patterns_ if re.match(pattern_, name_)] + assert ( + len(matched_) <= 1 + ), f"Got more than one matching patterns={matched_} for {name_}" + if matched_: + pattern_ = matched_[0] + unmatched_patterns.discard(pattern_) + return pattern_ + return None + + def get_matched_sub_params_pattern(name_): + for subparam_shape_dict in parameter_with_sub_params: + subparam_shape = SubparamShape(**subparam_shape_dict) + for pattern_ in subparam_shape.patterns: + if re.match(pattern_, name_): + unmatched_patterns.discard(pattern_) + return subparam_shape + return None + + matched_sub_params_shape = get_matched_sub_params_pattern(name) + + step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape) + if step_merged: + _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0]) + + for state in ("fp32", "exp_avg", "exp_avg_sq"): + slices = 
_merge_zero_shards(slice_base_path, state, tp_degree, shape) + final_path = os.path.join(param_base_path, f"{state}.pt") + + # print(f"Expected shape: {shape}") + # print(f"Fragment sizes:", list(frag.shape for frag in slices)) + ckpt_dict = {} + if get_matched_pattern(replicated_parameters, name): + if len(slices) > 1: + assert all([slices[0].equal(other_slice) for other_slice in slices[1:]]) + param = slices[0] + # print(f'replicate {name} using first slice') + elif get_matched_pattern(parameters_to_average, name): + param = sum(slices) / len(slices) + # print(f'merge {name} using average') + elif get_matched_pattern(parameters_with_2_sub_params_cat_dim_0, name): + cat_dim = 0 + chunked_slices = [torch.chunk(s, 2, dim=cat_dim) for s in slices] + merged_chunks_0 = torch.cat([s[0] for s in chunked_slices], dim=cat_dim) + merged_chunks_1 = torch.cat([s[1] for s in chunked_slices], dim=cat_dim) + param = torch.cat([merged_chunks_0, merged_chunks_1], dim=cat_dim) + ckpt_dict[CAT_DIM] = cat_dim + ckpt_dict[PARAM_N_SUB_PARAMS] = 2 + elif matched_sub_params_shape: + merged_chunks = [] + partition_dim = matched_sub_params_shape.partition_dim + + sub_dim_sizes = matched_sub_params_shape.shape[partition_dim] + if not isinstance(sub_dim_sizes, tuple): + sub_dim_sizes = (sub_dim_sizes,) + + partition_shape = [ + sum(d) if isinstance(d, tuple) else d + for d in matched_sub_params_shape.shape + ] + partition_shape = [ + d // tp_degree if i == partition_dim else d + for i, d in enumerate(partition_shape) + ] + slices = [s.view(partition_shape) for s in slices] + + offset = 0 + for sub_dim_size in sub_dim_sizes: + part_sub_dim_size = sub_dim_size // tp_degree + merged_chunks.append( + torch.cat( + [ + s.narrow(partition_dim, offset, part_sub_dim_size) + for s in slices + ], + dim=partition_dim, + ) + ) + offset += part_sub_dim_size + param = torch.cat(merged_chunks, dim=partition_dim) + ckpt_dict[SUB_PARAM_SHAPE] = matched_sub_params_shape + else: + cat_dim = ( + 1 if 
get_matched_pattern(parameters_with_row_parallelism, name) else 0 + ) + # print(f"merge {name} with CAT DIM: {cat_dim}") + param = torch.cat(slices, dim=cat_dim) + ckpt_dict[CAT_DIM] = cat_dim + + if get_matched_pattern(vocabulary_parameters, name): + # print(f"Before {param.shape=}") + # strip padding + original_vocab_size = universal_checkpoint_info["original_vocab_size"] + param = param[:original_vocab_size, :] + ckpt_dict[VOCAB_TENSOR] = True + # print(f"After {param.shape=}") + + # print(f"Final shape: {param.shape}") + ckpt_dict[PARAM] = param + _save_checkpoint(final_path, ckpt_dict) + + return unmatched_patterns + + +def merge_zero3_slices(dp_degree, dir, slice_dir, name): + slice_base_path = os.path.join(slice_dir, name) + param_base_path = os.path.join(dir, name) + + for state in ("fp32", "exp_avg", "exp_avg_sq"): + slices = _merge_zero_shards(slice_base_path, state, 1) + final_path = os.path.join(param_base_path, f"{state}.pt") + _save_checkpoint(final_path, slices[0]) + + +def _do_parallel_work(do_work, work_chunks, num_workers): + results = [] + if num_workers > 1: + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_list = [executor.submit(do_work, work) for work in work_chunks] + for f in tqdm.tqdm(future_list): + results.append(f.result()) + else: + # No parallel pass for unit testing + # We can't create child processes in tests + for work in tqdm.tqdm(work_chunks): + results.append(do_work(work)) + return results + + +def _extract_zero_shard_files(args, ds_checkpoint, temp_dir): + _3d_range_list = list( + itertools.product( + range(ds_checkpoint.pp_degree), + range(ds_checkpoint.tp_degree), + range(ds_checkpoint.dp_degree), + ) + ) + # pprint(f'{_3d_range_list=}') + + do_work = partial(extract_zero_shards, temp_dir, ds_checkpoint) + _do_parallel_work(do_work, _3d_range_list, args.num_extract_workers) + + +def _extract_zero_shard_files_stage3( + args, optim_files, param_shapes, dp_degree, temp_dir +): + do_work = partial( + 
extract_zero_shards_stage3, optim_files, param_shapes, dp_degree, temp_dir + ) + _do_parallel_work(do_work, list(range(dp_degree)), args.num_extract_workers) + + +def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir): + zero_output_folder = os.path.join(args.output_folder, "zero") + do_work = partial( + merge_tp_slices, + ds_checkpoint, + zero_output_folder, + temp_dir, + ds_checkpoint.tp_degree, + ) + unmatched_patterns_lists = _do_parallel_work( + do_work, list(slice_shapes.items()), args.num_merge_workers + ) + + # verify that all patterns were used + # if a pattern was not used by any of the workers, then it was not used at all -> assert/alert + sets = [set(lst) for lst in unmatched_patterns_lists] + unmatched_patterns = list(set.intersection(*sets)) + if args.strict: + assert ( + not unmatched_patterns + ), f"Unused patterns={unmatched_patterns} while merging tp slices" + elif unmatched_patterns: + print(f"Warning: Unused patterns={unmatched_patterns} while merging tp slices") + + +def _merge_zero3_slice_files(args, param_shapes, dp_degree, temp_dir): + zero_output_folder = os.path.join(args.output_folder, "zero") + do_work = partial(merge_zero3_slices, dp_degree, zero_output_folder, temp_dir) + _do_parallel_work(do_work, param_shapes.keys(), args.num_merge_workers) + + +def _zero_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _parse_model_states_stage3(files): + return torch.load(files[0], map_location=torch.device("cpu"))[PARAM_SHAPES] + + +def _save_optimizer_state(args, ds_checkpoint): + sharded_states = [ + BASE_OPTIMIZER_STATE, + PARAM_SLICE_MAPPINGS, + SINGLE_PARTITION_OF_FP32_GROUPS, + ] + sd = ds_checkpoint.get_zero_checkpoint_state( + pp_index=0, tp_index=0, dp_index=0, strip_tensor_paddings=False + ) 
+ + optim_sd = sd[OPTIMIZER_STATE_DICT] + output_sd = {k: v for k, v in optim_sd.items() if k not in sharded_states} + output_sd[PARAM_GROUPS] = optim_sd[BASE_OPTIMIZER_STATE][PARAM_GROUPS] + zero_output_folder = os.path.join(args.output_folder, "zero") + output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") + _save_checkpoint(output_file_path, output_sd) + + +def _save_optimizer_state_stage3(args, optim_files): + sd = torch.load(optim_files[0], map_location=torch.device("cpu")) + output_sd = sd[OPTIMIZER_STATE_DICT] + output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS] + zero_output_folder = os.path.join(args.output_folder, "zero") + output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") + _save_checkpoint(output_file_path, output_sd) + + +def _get_optim_files(checkpoint_dir): + return _get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def _get_model_state_files(checkpoint_dir): + return _get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def _get_checkpoint_files(checkpoint_dir, glob_pattern): + ckpt_files = sorted( + glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys + ) + + if len(ckpt_files) == 0: + raise FileNotFoundError( + f"can't find {glob_pattern} files in directory '{checkpoint_dir}'" + ) + + return ckpt_files + + +def _get_zero_stage(optim_files): + state_dict = torch.load(optim_files[0], map_location=torch.device("cpu")) + optimizer_state = state_dict[OPTIMIZER_STATE_DICT] + zero_stage = optimizer_state.get(ZERO_STAGE, 1) + return zero_stage + + +def _inject_missing_state(ds_checkpoint): + if UNIVERSAL_CHECKPOINT_INFO not in ds_checkpoint.global_state: + sd = torch.load( + ds_checkpoint.mp_rank_files[0], map_location=torch.device("cpu") + ) + if UNIVERSAL_CHECKPOINT_INFO not in sd: + ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO] = {} + ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO][ + UNIVERSAL_CHECKPOINT_VERSION_KEY + ] = 
UNIVERSAL_CHECKPOINT_VERSION_VALUE + + +def _check_for_required_state(ds_checkpoint): + universal_checkpoint_info = ds_checkpoint.get_checkpoint_info( + UNIVERSAL_CHECKPOINT_INFO + ) + assert ( + universal_checkpoint_info is not None + ), f"Required {UNIVERSAL_CHECKPOINT_INFO} state is missing in checkpoint. Verify that client creates this state." + + +def main(args): + print(f"Convert DeepSpeed Checkpoint to Universal Checkpoint") + + print( + f"Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}" + ) + + optim_files = _get_optim_files(args.input_folder) + zero_stage = _get_zero_stage(optim_files) + + if zero_stage <= 2: + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder) + if args.inject_missing_state: + _inject_missing_state(ds_checkpoint) + else: + _check_for_required_state(ds_checkpoint) + + iteration = ds_checkpoint.get_iteration() + # _create_latest_file(args.output_folder, iteration) + checkpoint_paths = _create_checkpoint_paths( + args.output_folder, + iteration, + ds_checkpoint.tp_degree, + ds_checkpoint.pp_degree, + ) + + slice_shapes = [] + for mp_rank_file in ds_checkpoint.mp_rank_files: + mp_sd = torch.load(mp_rank_file, map_location=torch.device("cpu")) + slice_shapes += mp_sd[PARAM_SHAPES] + + # fix back to normal flat dict, merge duplicates for tp>1 + slice_shapes = dict((k, v) for d in slice_shapes for k, v in d.items()) + temp_dir = os.path.join(args.output_folder, "tmp") + + print("1. Extracting ZeRO fragments") + _extract_zero_shard_files(args, ds_checkpoint, temp_dir) + + print("2. Merging slices .....") + _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir) + + print("3. 
Saving common optimizer states") + _save_optimizer_state(args, ds_checkpoint) + + if not args.keep_temp_folder: + shutil.rmtree(temp_dir, ignore_errors=True) + + # Copy mp* files into output folder + for f in glob.glob(os.path.join(args.input_folder, "mp*")): + shutil.copy2(f, args.output_folder) + + else: + model_files = _get_model_state_files(args.input_folder) + param_shapes = _parse_model_states_stage3(model_files) + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + dp_degree = len(model_files) + + temp_dir = os.path.join(args.output_folder, "tmp") + + print("*** 1. Extracting ZeRO fragments") + _extract_zero_shard_files_stage3( + args, optim_files, param_shapes, dp_degree, temp_dir + ) + + print("*** 2. Merging slices .....") + _merge_zero3_slice_files(args, param_shapes, dp_degree, temp_dir) + + print("*** 3. Saving common optimizer states") + _save_optimizer_state_stage3(args, optim_files) + + if not args.keep_temp_folder: + shutil.rmtree(temp_dir, ignore_errors=True) + + # Copy *model_states files into output folder + for f in glob.glob(os.path.join(args.input_folder, "*model_states.pt")): + shutil.copy2(f, args.output_folder) + + # Update latest to output folder + checkpoint_root_folder, step_folder = os.path.split(args.output_folder) + latest_file = os.path.join(checkpoint_root_folder, "latest_universal") + with open(latest_file, "w") as f: + f.write(step_folder) + + print("*** Done!") + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index bc2adb26fa..a52fc2bbb1 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -1,112 +1,771 @@ #!/bin/bash --login +############################################################################### +# [`ALCF/helpers.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/helpers.sh) +# +# Contains helper functions for launching `../train_llama_alcf.sh` +# +# To use, on any of {Polaris, Aurora, Sunspot} @ ALCF: +# +# 
```bash +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed +# $ export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && setup +# ``` +# +# and this will, automatically: +# +# 1. Setup python (conda + virtual environment) +# +# 2. Parse `$PBS_*` env vars to build appropriate `alias launch='mpiexec ...'` +# command for launching across all GPUs in our active PBS job. +############################################################################### +################## +# helpers_main +# +# This will get called automatically when running: +# +# ```bash +# $ cd Megatron-DeepSpeed +# $ PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh +# ``` +# +# - This will set `"${WORKING_DIR}"`, according to: +# +# 1. if `${PBS_O_WORKDIR}` is nonzero, use this +# 2. else, if `${SLURM_SUBMIT_DIR}` is nonzero use this +# 3. else, use `$(pwd)` +# +# this is _crucial_ since many of the functions below use paths +# which are defined relative to this "${WORKING_DIR}" +# (e.g. virtual environment, location of executables, etc.) +################## +helpers_main() { + # NOTE: for debug mode, run with `DEBUG=1` + if [[ -n "${DEBUG:-}" ]]; then + set -euxo + fi + if [[ -n "${PBS_O_WORKDIR}" ]]; then + WORKING_DIR="${PBS_O_WORKDIR}" + elif [[ -n "${SLURM_SUBMIT_DIR}" ]]; then + WORKING_DIR="${SLURM_SUBMIT_DIR}" + else + echo "Unable to detect PBS or SLURM working directory info..." + WORKING_DIR=$(python3 -c 'import os; print(os.getcwd())') + echo "Using ${WORKING_DIR} as working directory..." + fi + export WORKING_DIR="${WORKING_DIR}" + printf "Using WORKING_DIR: %s\n" "${WORKING_DIR}" +} + +############################################################################## +# setup +# +# All-in-one helper function. +# +# - Explicitly, this will: +# - Identify the machine we're on +# +# - Setup `python` +# 1. Load `conda` +# 2. 
Setup `venv` on top of `conda` +# +# - Ensure all dependencies are installed +# +# - Clone + Install [`saforem2/ezpz`](https://github.com/saforem2/ezpz) +# - Source [`ezpz/utils.sh`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/bin/utils.sh) +# - This provides `{ezpz_setup_python, ezpz_setup_job}` (called below) +# +# - Set runtime options +# +# - Build `deepspeed_config.json` +# +# - Build {logs, checkpoints, etc} dirs, named according to specifics of +# current run +# +# - Specify additional `deepspeed` arguments +# +# - Ensure executable exists at expected path +# +# - Setup data + tokenizer via `TOKENIZER_TYPE` +# +# - Print job info +# +# - Save `.env` to `CKPT_DIR` for safe keeping +# +# - Check that we're not already running, and if so, exit. +# +# - Setup run command to be executed. +############################################################################## +setup() { + # Identify machine we're on + get_machine || exit + ########################################################################## + # ezpz_setup will: + # 1. Setup python + # - load base conda + # - (if necessary) create virtual environment on top of base conda + # - activate virtual environment from ^ + # 2. Install ezpz (if needed) + # 3. Parse PBS_* environment variables to determine: + # - NHOSTS (by counting number of lines in $PBS_NODEFILE) + # - NGPU_PER_HOST (by magic) + # - NGPUS (= NHOSTS * NGPU_PER_HOST) + # 4. 
Use these (^) to build our launch command + ezpz_setup || exit + ########################################################################## + install_dependencies + # Set command line arguments to pass to `"${EXEC}"` + setParams || exit + # Create `deepspeed_config.json` from runtime params from ^ + buildDSconfig || exit + # Specify output directory for {logs, checkpoints, etc.} + setup_checkpoint || exit + setOutput || exit + # Specify additional `deepspeed` arguments (dependent on _newly created_ variables) + set_args || exit + # Ensure executable exists in expected path + check_executable "${EXEC:-${WORKING_DIR}/pretrain_gpt_alcf.py}" + dfl="${DATA_FILE_LIST:-"${PBS_O_WORKDIR}/ALCF/data-lists/$(get_machine_name)/dolma.txt"}" + # Setup data + tokenizer via `DATA_FILE_LIST` and `TOKENIZER_TYPE` + tok="${TOKENIZER_TYPE:-Llama2Tokenizer}" + setup_tokenizer_and_data "${tok}" "${dfl}" || exit + make_data || exit + # Print job info + printJobInfo || exit + # Save `.env` to `CKPT_DIR` for safe keeping + save_dotenv "${CKPT_DIR}" || exit + # Check that were not already running, if so, exit. + check_and_kill_if_running || exit + # Setup run command to be executed + setup_run_cmd "$@" || exit +} + +##################################################### +# setup_run_cmd +# +# Build run command to be executed. 
+##################################################### +setup_run_cmd() { + ############################## + # take in additional arguments + # and append them directly to + # the end of the `run_cmd` + # custom_args="$@" + custom_args=("$@") + ############################## + #### Make it easy to track experiments by date ################### + year="$(date "+%Y")" + month="$(date "+%m")" + day="$(date "+%Y-%m-%d")" + today="$(date "+%Y-%m-%d")" # kept for backwards compatibility + started_at="$(date "+%Y-%m-%d-%H%M%S")" + export YEAR="${year}" + export MONTH="${month}" + export DAY="${day}" + export TODAY="${today}" + export STARTED_AT="${started_at}" + ################################################################## + # NOTE: to launch with DeepSpeed instead of mpiexec: + # `export LAUNCH_WITH=deepspeeed && bash train_llama_alcf.sh` + ################################################################## + setupLauncher "${LAUNCH_WITH:-MPICH}" || exit + export data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path}" + printf "\n" + echo "Using data_cache_path: ${data_cache_path}" + ################################################################## + # WARN: to disable Llama-type architectures, toggle via: + # `NO_LLAMA=1 bash train_llama_alcf.sh` + ################################################################## + if [[ -z "${NO_LLAMA:-}" ]]; then + llama_flags=( + "--swiglu" + "--hidden-dropout 0" + "--attention-dropout 0" + "--normalization rmsnorm" + "--disable-bias-linear" + "--no-query-key-layer-scaling" + "--use-rotary-position-embeddings" + "--untie-embeddings-and-output-weights" + "--num-key-value-heads ${NUM_KV_HEAD}" + "--ffn-hidden-size ${FFN_HIDDEN_SIZE}" + ) + fi + # min_lr=$(python3 -c 'print(f"{2 / (10 ** 5):.8f}")') + # "--min-lr ${LR:-${min_lr}}" # 2e-5 + # "--min-lr ${MIN_LR:-"2e-6"}" # 2e-5 + export LR="${LR:-0.0002}" + export LR_DECAY_STYLE="${LR_DECAY_STYLE:-cosine}" + export LR_WARMUP_FRAC="${LR_WARMUP_FRAC:-0.05}" + 
lr_flags=( + "--lr ${LR}" + "--lr-decay-style ${LR_DECAY_STYLE}" + "--lr-warmup-fraction ${LR_WARMUP_FRAC}" + ) + if [[ -n "${LR_DECAY_ITERS:-}" ]]; then + lr_flags+=("--lr-decay-iters ${LR_DECAY_ITERS:-}") + fi + + tb_flags=() + if [[ -z "${NO_TENSORBOARD:-}" ]]; then + TBDIR="${CKPT_DIR}/tensorboard" + mkdir -p "${TBDIR}" + tb_flags+=( + "--log-timers-to-tensorboard" + "--log-optimizer-states-to-tensorboard" + "--tensorboard-dir ${TBDIR}" + ) + fi + dfl_fallback="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/$(get_machine_name)/dolma.txt}" + + train_args=() + if [[ -z "${OVERRIDE_CKPT_OPT_PARAM:-}" ]]; then + train_args+=("--use-checkpoint-opt_param-scheduler") + fi + # "--init-method-std ${INIT_METHOD_STD:-0.0006}" + # "--shuffle-sample" + train_args+=( + "${lr_flags[@]}" + "${custom_args[@]}" + "${llama_flags[@]}" + "${FLASH_ARG}" + "${TIMING_STR:-}" + "${DATA_FLAGS}" + "${TOKENIZER_FLAGS}" + "${tb_flags[@]}" + "${ds_args[@]}" + "${gpt_args[@]}" + "--${DTYPE}" + "--shuffle-sample-in-corpus" + "--blend-sample-in-corpus" + "--accumulate-allreduce-grads-in-fp32" + "--no-bias-gelu-fusion" + "--no-bias-dropout-fusion" + "--no-masked-softmax-fusion" + "--no-gradient-accumulation-fusion" + "--optimizer=${OPT}" + "--tensor-model-parallel-size=${TP}" + "--pipeline-model-parallel-size=${PP}" + "--max-position-embeddings=${SEQ}" + "--micro-batch-size=${MICRO_BATCH}" + "--ds-sequence-parallel-size=${SP}" + "--global-batch-size=${GLOBAL_BATCH}" + "--split=${TRAIN_SPLIT:-990},${VAL_SPLIT:-10},${TEST_SPLIT:-0}" + "--timing-log-level=${TIMING_LOG_LEVEL:-1}" + "--eval-interval=${EVAL_INTERVAL:-100}" + "--eval-iters=${EVAL_ITERS:-20}" + "--save-interval=${SAVE_INTERVAL:-50}" + "--log-interval=${LOG_INTERVAL:-1}" + "--save=${SAVE:-${CKPT_DIR}}" + "--load=${LOAD:-${CKPT_DIR}}" + "--seq-length=${SEQ}" + "--num-layers=${NLAYERS}" + "--hidden-size=${HIDDEN}" + "--train-iters=${TRAIN_ITERS}" + "--distributed-backend=${BE}" + "--weight-decay=${WEIGHT_DECAY:-0.1}" + 
"--adam-beta1=${ADAM_BETA1:-0.9}" + "--adam-beta2=${ADAM_BETA2:-0.95}" + "--adam-eps=${ADAM_EPS:-0.00001}" + "--clip-grad=${CLIP_GRAD:-1.0}" + "--num-attention-heads=${HEADS}" + "--data-cache-path=${data_cache_path}" + "--data-file-list=${DATA_FILE_LIST:-${dfl_fallback}}" + ) + # "--adam-eps ${ADAM_EPS:-0.00001}" + cache_dir="${PBS_O_WORKDIR}/.cache/" + mkdir -p "${cache_dir}" + targs_cache="${cache_dir}/train_args.txt" + for arg in "${train_args[@]}"; do echo "${arg}" >>"${targs_cache}"; done + export TRAIN_ARGS=("$(printf '%s\n' "${train_args[@]}" | sort)") + printf "Training Arguments: %s\n" "${TRAIN_ARGS[@]}" + export run_cmd=("${LAUNCHER}" "${train_args[@]}") +} + +save_dotenv() { + if [[ "$#" -ne 1 ]]; then + estr="[error]" + printf "%s Expected one argument (outdir). Received: %s" "$(printRed "${estr}")" "$#" + else + outdir="$1" + mkdir -p "${outdir}" + module list + dotenv_file="${outdir}/.env" + echo "Saving environment to ${dotenv_file}" + printenv | grep -v "LS_COLORS" >"${dotenv_file}" + export DOTENV_FILE="${dotenv_file}" + fi +} + +###################################################################### +# get_machine_name: +# +# Return current machine name, as lowercase string +# +# Example: +# ```bash +# $ machine_name=$(get_machine_name) +# $ echo "machine_name: ${machine_name}" +# ``` +###################################################################### +get_machine_name() { + if [[ $(hostname) == x4* || $(hostname) == aurora* ]]; then + machine="aurora" + elif [[ $(hostname) == x1* || $(hostname) == uan* ]]; then + machine="sunspot" + elif [[ $(hostname) == x3* || $(hostname) == polaris* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + machine="sirius" + else + machine="polaris" + fi + elif [[ $(hostname) == sophia* ]]; then + machine="sophia" + elif [[ $(hostname) == nid* ]]; then + machine="perlmutter" + else + machine=$(hostname) + fi + echo "${machine}" +} + +get_machine() { + machine=$(hostname) + if [[ $(hostname) == x4* ]]; then + 
machine="aurora" + elif [[ $(hostname) == x1* ]]; then + machine="sunspot" + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + machine="sirius" + else + machine="polaris" + fi + elif [[ $(hostname) == sophia* ]]; then + machine="sophia" + elif [[ $(hostname) == nid* ]]; then + machine="perlmutter" + else + echo "Unknown MACHINE. Setting MACHINE to $(hostname) and continuing..." + fi + export MACHINE="${machine}" + printf "Running on: %s\n" "$(printBlue "${MACHINE}")" +} + +check_and_kill_if_running() { + RUNNING_PIDS=$(lsof -i:29500 -Fp | head -n 1 | sed 's/^p//') + if [[ -n "${RUNNING_PIDS}" ]]; then + echo "Caught ${RUNNING_PIDS}" && kill "${RUNNING_PIDS}" + else + echo "Not currently running. Continuing!" + fi +} + +setupSrun() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$((NHOSTS * NGPU_PER_HOST))" + export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" - echo "- MPICH_DIR=$MPICH_DIR" + echo "- MPICH_DIR=${MPICH_DIR:-${MPI_ROOT:-}}" echo "- Using $(which python3)" - echo "- WORLD_SIZE:${WORLD_SIZE}" - echo "- NCCL: ${NCCL:-nccl}" - echo "- MODEL_TYPE: ${MODEL_TYPE}" - echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" + echo "- WORLD_SIZE:${WORLD_SIZE-}" + echo "- BACKEND: ${BE:-}" + echo "- MODEL_TYPE: ${MODEL_TYPE:-}" + echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST:-}" echo "++++++++++++++++++++++++++++++++++++++++++++++++++" } -function setDSlauncher() { - # launcher setting - outdir=$1 - # hfds=$1 - # hfmpi=$2 - # here=$(python3 -c 'import os; print(os.getcwd())') - export hfds="$outdir/hostfile_deepspeed" - export hfmpi="$outdir/hostfile_mpich" - [ -f "$hfds" ] || exit - [ -f "$hfmpi" ] || 
exit - export LAUNCHER=${LAUNCHER:-MPICH} - if [[ $LAUNCHER == "deepspeed" ]]; then - export launcher="" +############################################################################# +# setupLauncher: Launch with one of `{mpiexec, deepspeed}`. +# +# Explicitly, look for `LAUNCH_CMD` in environment and launch accordingly. +# Will use `mpiexec` by default. +# To launch with `deepspeed` instead, specify `LAUNCH_CMD=deepspeed`, e.g. +# +# ```bash +# PBS_O_WORKDIR=$(pwd) LAUNCH_CMD=deepspeed bash train_llama_alcf.sh +# ``` +# +# will launch with `deepspeed` instead of `mpiexec`. +############################################################################# +setupLauncher() { + if [[ "$#" == 1 ]]; then + local dist_launcher="$1" + else + local dist_launcher="${LAUNCH_WITH:-${LAUNCH_CMD:-"MPICH"}}" + fi + if [[ "${dist_launcher}" == "deepspeed" ]]; then + # Save {PATH, LD_LIBRARY_PATH, ...} to .deepspeed_env + saveDSenv || exit + # Assert `./hostfile_deepspeed` exists + export hfds="${WORKING_DIR}/hostfile_deepspeed" + make_ds_hostfile || exit + export LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" + else + if [[ -n "${DIST_LAUNCH}" ]]; then + mn=$(get_machine_name) + if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then + LAUNCHER="${DIST_LAUNCH} --pmi=pmix --genvall $(which python3) -Wignore ${EXEC}" + elif [[ "${mn}" == "sophia" ]]; then + LAUNCHER="${DIST_LAUNCH} $(which python3) -Wignore ${EXEC}" + else + LAUNCHER="${DIST_LAUNCH} --genvall $(which python3) -Wignore ${EXEC}" + fi + export LAUNCHER="${LAUNCHER}" + else + echo "[setupLauncher][INFO]: Saving environment to: .env-${PBS_JOBID}" + printenv | tee ".env-${PBS_JOBID}" + echo "[setupLauncher][ERROR]: DIST_LAUNCH not found in environment !!" 
+ fi + fi + printf "Launching with: %s\n" "$(printRed "${dist_launcher}")" + printf " %s" "$(printMagenta "${LAUNCHER}")" +} + +# set_lr_args() { +# export LR=${LR:-0.0002} # LEARNING_RATE +# export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP +# export LR_DECAY_ITERS=${LR_DECAY_ITERS:-} # LR DECAY ITERS +# LR_ARGS="--lr ${LR} --lr-decay-style cosine" +# if [[ -n "${LR_DECAY_ITERS:-}" ]]; then +# LR_ARGS="${LR_ARGS} --lr-decay-iters ${LR_DECAY_ITERS}" +# fi +# if [[ -n "${LR_WARMUP_FRAC}" ]]; then +# LR_ARGS="${LR_ARGS} --lr-warmup-fraction ${LR_WARMUP_FRAC}" +# fi +# echo "LR_ARGS: ${LR_ARGS}" +# export LR_ARGS="${LR_ARGS}" +# } + +######################################################################### +# `get_batch_size_on_polaris`: Identify MICRO_BATCH to use on Polaris. +# +# - In particular, it seems that different node counts allow for different +# `MICRO_BATCH` sizes. +# +# Explicitly: +# +# - [1 <= NHOSTS <= 2]: `MICRO_BATCH=1` +# - [3 <= NHOSTS <= 7]: `MICRO_BATCH=2` +# - [8 <= NHOSTS]: `MICRO_BATCH=4` +# +# are the largest batch sizes that fit in memory at various node counts. +######################################################################### +get_batch_size_on_polaris() { + if [[ $(hostname) == x3* ]]; then + nhosts=$(wc -l <"${HOSTFILE:-${PBS_NODEFILE}}") + if [[ "${nhosts}" == 1 || "${nhosts}" == 2 ]]; then + mbs=1 + elif [[ "${nhosts}" -ge 3 && "${nhosts}" -le 7 ]]; then + mbs=2 + elif [[ "${nhosts}" -ge 8 ]]; then + mbs=4 + fi + fi + echo "${mbs}" +} + +_get_num_hosts_from_hostfile() { + if [[ "$#" == 1 ]]; then + if [[ -f "$1" ]]; then + nhosts=$(wc -l <"$1") + echo "${nhosts}" + else + exit 1 + fi + else + exit 1 + fi +} + +########################################### +# get_grad_acc_steps_on_aurora +# +# NOTE: +# We use different numbers of gradient +# accumulation steps (GAS) depending +# on the number of hosts in our job. 
+# +# Each host has: +# +# [2 tiles] x [6 xpus / tile] = 12 xpus +# +# | nnhosts | nhosts | GAS | +# |:---------------:|:----------:|:-----:| +# | 256 <= n < inf | [256, inf) | 1 | +# | 128 <= n < 256 | [128, 256) | 2 | +# | 32 <= n < 128 | [32, 128) | 4 | +# | 16 <= n < 32 | [16, 32) | 8 | +# | 0 <= n < 16 | [0, 16) | 16 | +# +########################################### +get_grad_acc_steps_on_aurora() { + if [[ "$#" == 0 ]]; then + hf="${HOSTFILE:-${PBS_NODEFILE:-$(ezpz_get_pbs_nodefile_from_hostname)}}" + elif [[ "$#" == 1 ]]; then + hf="$1" + else + echo "Usage: get_grad_acc_steps_on_aurora" + echo "Expected exactly 0 or 1 arguments, received: $#" + exit 1 + fi + nhosts=$(wc -l <"${hf}") + if [[ "${nhosts}" -gt 256 ]]; then + gas=1 + elif [[ 128 -le "${nhosts}" && "${nhosts}" -lt 256 ]]; then + gas=2 + elif [[ 32 -lt "${nhosts}" && "${nhosts}" -lt 129 ]]; then + gas=4 + elif [[ 16 -le "${nhosts}" && "${nhosts}" -le 32 ]]; then + gas=8 else - export launcher="--force_multi --hostfile $hfds --launcher=${LAUNCHER} --launcher_args='-hostfile ${hfmpi}'" + gas=16 fi + echo "${gas}" } +set_ccl_vars_on_aurora() { + export CCL_KVS_MODE=mpi + export CCL_CONFIGURATION_PATH="" + export CCL_CONFIGURATION=cpu_gpu_dpcpp + # export CCL_ROOT=/tmp/oneccl/ + # export LD_LIBRARY_PATH=${CCL_ROOT}/lib:$LD_LIBRARY_PATH + # export CPATH=${CCL_ROOT}/include:$CPATH + # export LIBRARY_PATH=${CCL_ROOT}/lib:$LIBRARY_PATH + export CCL_KVS_CONNECTION_TIMEOUT=3600 + export FI_CXI_RX_MATCH_MODE=hybrid + export CCL_BCAST=double_tree + + export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 + export CCL_PROCESS_LAUNCHER=pmix # Required by Aurora mpich + export FI_PROVIDER=cxi # Required by Aurora mpich + export PALS_PMI=pmix # Required by Aurora mpich + export CCL_ATL_TRANSPORT=mpi # Required by Aurora mpich + export TORCH_LLM_ALLREDUCE=1 + export CCL_SYCL_ESIMD=1 + export CCL_ALLGATHERV_MEDIUM_SIZE_THRESHOLD=0 # Required by current oneCCL (MLSL-2881) + export CCL_ENABLE_SYCL_KERNELS=1 + export 
CCL_WORKER_AFFINITY=5,13,21,29,37,45,57,65,73,81,89,97 + export CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD=32768 + export FI_CXI_DEFAULT_CQ_SIZE=1048576 + export FI_CXI_RX_MATCH_MODE=hybrid + export CCL_BCAST=double_tree +} + +############################################################################## +# setParams +# +# Set / configure run options by parsing environment. +# +# - any of the declared options below can be overridden +# dynamically at runtime, e.g. to run with a `MICRO_BATCH` size of 2: +# ```bash +# $ PBS_O_WORKDIR=$(pwd) MICRO_BATCH=2 bash train_llama_alcf.sh +# ``` +############################################################################## setParams() { - # ---- [Parallelism Settings] -------------------------------------------- - # -------- [Aurora] ---- || ----- [SunSpot] ------------ - if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then - TP=${TP:-1} # TP = 1 - PP=${PP:-1} # PP = 1 - export CCL=${CCL:-ccl} # CCL - export BE="${CCL}" # BE = CCL - export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 - MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 - echo "!!!! Using CPU_OPTIMIZER on Intel XPU by Default !!!!" - export CPU_OPTIMIZER=${CPU_OPTIMIZER:-1} # CPU OPTIMIZER ON INTEL XPU - # -------- [Polaris] ----------------------------------- - elif [[ $(hostname) == x3* ]]; then - TP=${TP:-2} # TP = 2 - PP=${PP:-1} # PP = 1 - export NCCL=${NCCL:-nccl} # NCCL - export BE="${NCCL}" # BE = NCCL - # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? 
- export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 - MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 - fi - # ------------------------------------------------------------------------ - export PP="${PP}" + FLASH_ARG="" + # ---- [Parallelism Settings] -------------------------------------------+ + # ------ [Aurora] -------||------ [SunSpot] ------------- + # if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then + mn=$(get_machine_name) + if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then + TP=${TP:-1} # TP = 1 + export SAVE_INTERVAL="${SAVE_INTERVAL:-50}" + export CCL=${CCL:-ccl} # CCL + export BE="${CCL}" # COMMUNICATION BACKEND = CCL + export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 + # export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} # GRADIENT_ACC_STEPS + gas=$(get_grad_acc_steps_on_aurora "${PBS_NODEFILE:-${HOSTFILE:-${hostfile}}}") + export GRAD_ACC_STEPS="${GRAD_ACC_STEPS:-${gas}}" + # export GRAD_ACC_STEPS="${GRAD_ACC_STEPS:-$(get_grad_acc_steps_on_aurora "$@)}" + echo "[setParams] Using GRAD_ACC_STEPS: ${GRAD_ACC_STEPS}" + MICRO_BATCH=${MICRO_BATCH:-1} + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" + else + FLASH_ARG="--use-flash-attn-builder" + fi + #### [sam: 08/17/2024] ########################################## + # Use best set of CCL env vars from Gordon Bell runs on Aurora + set_ccl_vars_on_aurora + ################################################################# + #### [sam: 06/20/2024] ############################################### + # export CCL_PROCESS_LAUNCHER=pmix + # export CCL_ATL_TRANSPORT=mpi + # !XXX: USE KEY VALUE STORE FIX ON AURORA [2024-06-20] + # use_kvs_fix_on_aurora # <-- why are these different from those in update_ccl_env_vars_aurora ?? + # update_ccl_env_vars_aurora + ###################################################################### + # if [[ -z "${USE_FLASH_ATTN:-}" ]]; then + # # NOTE: if NO_FLASH_ATTN is NON-empty; then NO FLASH ATTN !! 
+ # export NO_FLASH_ATTN=1 # disabled on [2024-06-20] waiting on fix... + # if [[ -n "${NO_FLASH_ATTN-}" ]]; then + # echo "Not using flash-attn!!" + # else + # FLASH_ARG="--use-flash-attn-builder" + # fi + # else + # echo "Using flash-attn !!" + # FLASH_ARG="--use-flash-attn-builder" + # fi + # [Polaris] + elif [[ "${mn}" == "polaris" || "${mn}" == "sirius" ]]; then + # export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}" + TP=${TP:-1} # TP = 2 + export NCCL=${NCCL:-nccl} # NCCL + export BE="${NCCL}" # BE = NCCL + # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? + export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8} # GRADIENT_ACC_STEPS + # NOTE: MICRO_BATCH is exported below + # MICRO_BATCH=${MICRO_BATCH:-2} # MICRO_BATCH = 8 + export MICRO_BATCH="${MICRO_BATCH:-$(get_batch_size_on_polaris)}" + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" + else + FLASH_ARG="--use-flash-attn-v2" + fi + echo "Setting up AWS NCCL OFI Plugin on Polaris..." + source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit + # ---- [Sophia] ---------------------- + elif [[ "${mn}" == sophia* ]]; then + # export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}" + TP=${TP:-1} # TP = 2 + export NCCL=${NCCL:-nccl} # NCCL + export BE="${NCCL}" # BE = NCCL + export DTYPE=${DTYPE:-bf16} # DTYPE: FP16 + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8} # GRADIENT_ACC_STEPS + export MICRO_BATCH="${MICRO_BATCH:-$(get_batch_size_on_polaris)}" + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" + else + FLASH_ARG="--use-flash-attn-v2" + fi + # echo "Setting up AWS NCCL OFI Plugin on Polaris..." + # source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit + # [Perlmutter] + elif [[ "${mn}" == login* || "${mn}" == nid* ]]; then + TP="${TP:-2}" + export NCCL="${NCCL:-nccl}" + export BE="${NCCL}" + export DTYPE="${DTYPE:-bf16}" + MICRO_BATCH="${MICRO_BATCH:-1}" + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" 
+ else + FLASH_ARG="--use-flash-attn-v2" + fi + fi export TP="${TP}" + export PP="${PP:-1}" + export SP="${SP:-1}" + export FLASH_ARG="${FLASH_ARG}" + export DTYPE="${DTYPE:-bf16}" + export OPT="${OPT:-adamw}" + export WEIGHT_DECAY="${WEIGHT_DECAY:-0.1}" export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" - export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} - # ---- Llama2 7B Config ------------------------------ - export MODEL_KEY="Llama-7B" - export HEADS=${HEADS:-32} - export NLAYERS=${NLAYERS:-32} - export HIDDEN=${HIDDEN:-4096} - export NUM_KV_HEAD=${NUM_KV_HEAD:-8} - export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} - # ---- Run Settings ---------------------------------- - export LR=${LR:-0.0003} - export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 - export ZERO_STAGE=${ZERO_STAGE:-2} - export MICRO_BATCH=${MICRO_BATCH:-8} - export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} - export EVAL_ITERS="${EVAL_ITERS:-10}" - export TRAIN_ITER=${TRAIN_ITER:-317892} - export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" - export SAVE_INTERVAL=${SAVE_INTERVAL:-200} - export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} - # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} - # export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) - export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) - export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" - tm="${PBS_O_WORKDIR}/ALCF/tokenizer.model" - # tm_a=/home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/tokenizer.model - # tm_p="/eagle/datasets/dolma/utils/tokenizer.model" - # export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm_p:-${tm_a}}}" - export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" - export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" - export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm 
--disable-bias-linear" - # if [[ "${CPU_OPTIMIZER:-0}" ]]; then - if [[ -n "${CPU_OPTIMIZER}" ]]; then - echo "\n!!! Appending \`--cpu-optimizer\` to LLAMA_ARGS..." - export LLAMA_ARGS="${LLAMA_ARGS} --cpu-optimizer" - fi - # ---------------------------------------------------- -} - - -setArgs() { + NHOSTS=$(wc -l <"${HOSTFILE}") + if [[ -z "${NGPU_PER_HOST:-}" ]]; then + NGPU_PER_HOST=$(python3 -c 'import ezpz as ez; print(ez.get_gpus_per_node())') + fi + export NGPU_PER_HOST="${NGPU_PER_HOST}" + export WORLD_SIZE="${WORLD_SIZE:-$((NHOSTS * NGPU_PER_HOST))}" + # +---[Llama2 7B Config]--------------------------------------------------+ + # export MODEL_KEY="Llama-7B" + export HEADS=${HEADS:-${NHEADS:-32}} # NUMBER OF ATEN HEADS + export NLAYERS=${NLAYERS:-${NUM_LAYERS:-32}} # NUMBER OF LAYERS + export HIDDEN=${HIDDEN:-4096} # HIDDEN SIZE + export NUM_KV_HEAD=${NUM_KV_HEAD:-8} # GROUP ATTENTION + export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} # FFN HIDDEN SIZE + # +---[Run Settings]------------------------------------------------------+ + export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 + export ZERO_STAGE=${ZERO_STAGE:-1} # ZERO OFFLOADING STAGE + export MICRO_BATCH=${MICRO_BATCH:-1} # MICRO BATCH SIZE + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} # GRADIENT ACCUMULATION STEPS + export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}" # TIMING VERBOSITY IN LOGS + export ACT_CKPT_NUM_LAYERS="${ACT_CKPT_NUM_LAYERS:-1}" # NUM LAYERS TO CHECKPOINT ACTIVATIONS + export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-} # USE ACTIVATION CHECKPOINTING ? 
+ export GLOBAL_BATCH_MAX=$((WORLD_SIZE * MICRO_BATCH * GRAD_ACC_STEPS / TP / PP / SP)) # MAX GLOBAL BATCH SIZE + export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" # WILL USE MAX IF NOT SET IN ENVIRONMENT + if [[ -n "${TRAIN_TOKENS:-}" ]]; then + export TRAIN_TOKENS="${TRAIN_TOKENS}" + export TRAIN_ITERS=$((TRAIN_TOKENS / SEQ / GLOBAL_BATCH)) + printf "TRAIN_TOKENS=%s (=%sB tokens)\n" "${TRAIN_TOKENS}" "$((TRAIN_TOKENS / 10 ** 9))" + printf "TRAIN_ITERS=%s\n" "${TRAIN_ITERS}" + elif [[ -z "${TRAIN_ITERS:-${TRAIN_ITER:-}}" ]]; then + export TRAIN_TOKENS=${TRAIN_TOKENS:-2000000000000} + export TRAIN_ITERS=$((TRAIN_TOKENS / SEQ / GLOBAL_BATCH)) + printf "TRAIN_TOKENS=%s (=%sB tokens)\n" "${TRAIN_TOKENS}" "$((TRAIN_TOKENS / 10 ** 9))" + printf "TRAIN_ITERS=%s\n" "${TRAIN_ITERS}" + else + export TRAIN_ITERS="${TRAIN_ITERS:-${TRAIN_ITER:-}}" + fi + export MODEL_TYPE="llama-gb${GLOBAL_BATCH}-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" # STRING FOR IDENTIFYING MODEL + # NOTE: [2024-07-10] ##################################################### + # - [sam]: For whatever reason, it seems that using + # sequence-parallelism (SP) > 1 is INCOMPATIBLE with + # rotary-position-embeddings (ROPE). + # + # For this reason, we only use the default LLAMA_ARGS when SP=0. 
+ ########################################################################## + # # -----[Learning Rate Settings]-------------------------------------------- + # export LR=${LR:-0.0002} # LEARNING_RATE + # export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP + # export LR_DECAY_ITERS=${LR_DECAY_ITERS:-} # LR DECAY ITERS + # set_lr_args + # -----[Learning Rate Settings]-------------------------------------------- + # # if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then + # if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then + # TIMING_STR="\ + # --timing-log-level ${TIMING_LOG_LEVEL}" + # # " + # else + # TIMING_STR="" + # fi +} + +############################################## +# set_args +# +# Specify additional (DeepSpeed specific) +# arguments to pass to pretrain_gpt_alcf.py +############################################## +set_args() { # ---- Set DeepSpeed arguments -------------------------------- - ds_args=" " - ds_args=" --deepspeed ${ds_args}" - if [[ $PP == 1 ]]; then - ds_args=" --no-pipeline-parallel ${ds_args}" + ds_args=( + "--deepspeed" + ) + if [[ "${PP:-1}" == 1 ]]; then + ds_args+=("--no-pipeline-parallel") fi - ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" - ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" - if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + ds_args+=("--deepspeed_config=${DS_CONFIG}") + ds_args+=("--zero-stage=$ZERO_STAGE") + if [[ "${ZERO_STAGE}" == 3 ]]; then + ds_args+=("--use-mics") + fi + # ds_args=" " + # ds_args=" --deepspeed ${ds_args}" + # if [[ $PP == 1 ]]; then + # ds_args=" --no-pipeline-parallel ${ds_args}" + # fi + # ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" + # ds_args="--zero-stage=$ZERO_STAGE ${ds_args}" + # if [[ "${ZERO_STAGE}" == 3 ]]; then + # ds_args="--use-mics ${ds_args}" + # fi + if [[ -n "${USE_ACTIVATION_CHECKPOINTING:-}" ]]; then echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" 
- ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + ds_args+=("--deepspeed-activation-checkpointing") + # ds_args=" --deepspeed-activation-checkpointing ${ds_args}" # --checkpoint-activations \ # --deepspeed-activation-checkpointing fi @@ -119,74 +778,165 @@ setArgs() { echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" gpt_args+=( "--checkpoint-activations" - "--checkpoint-num-layers 1" + "--checkpoint-num-layers ${ACT_CKPT_NUM_LAYERS}" ) fi export gpt_args } -ezpz() { - if [[ ! -d ezpz ]]; then - git clone https://github.com/saforem2/ezpz +make_ds_hostfile() { + export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" + # ---- Make MPICH hostfile ---------------- + hf="${HOSTFILE:-${PBS_NODEFILE}}" + export hostfile_mpich=hostfile_mpich + cat "${hf}" >"${hostfile_mpich}" + # ---- Make DeepSpeed hostfile ------------------- + export hostfile_deepspeed=hostfile_deepspeed + cat "${hf}" >"${hostfile_deepspeed}" + sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" +} + +########################################### +# ezpz_setup +# +# 1. Clone [`saforem2/ezpz`](https://github.com/saforem2/ezpz) (if necessary) +# to `"${WORKING_DIR}/deps/ezpz/"` +# +# 2. Source [`ezpz/src/ezpz/bin/utils.sh`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/bin/utils.sh) +# - This provides `{ezpz_setup_python, ezpz_setup_job}` (called below) +# +# 3. Call `ezpz_setup_python` (from `ezpz/bin/utils.sh`): +# - This will setup conda + virtual enviroment +# +# 4. Call `ezpz_setup_job` (from `ezpz/bin/utils.sh`): +# - This will parse `$PBS_*` variables and build launch cmd +# +# 3. 
Call `_ezpz_install` (from `Megatron-DeepSpeed/ALCF/helpers.sh`): +# - Install ezpz from `"${WORKING_DIR}/depz/ezpz/"` +########################################### +ezpz_setup() { + # setup_alcf "$@" + # file=$(mktemp) + # curl -Ls https://raw.githubusercontent.com/saforem2/ezpz/main/src/ezpz/bin/getjobenv > "${file}" + # shellcheck source=../deps/ezpz/src/ezpz/bin/utils.sh + ezdir="${WORKING_DIR}/deps/ezpz" + if [[ -d "${ezdir}" ]]; then + echo "Found ezpz in ${ezdir}" else - echo "Found ezpz!" + mkdir -p "$(dirname "${ezdir}")" + git clone https://github.com/saforem2/ezpz "${ezdir}" fi - if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then - echo "Has ezpz installed. Nothing to do." + # shellcheck source=../deps/ezpz/src/ezpz/bin/utils.sh + source "${ezdir}/src/ezpz/bin/utils.sh" || exit + ezpz_setup_python + ezpz_setup_job "$@" + ezpz_pip_loc=$(python3 -m pip list | grep ezpz | awk '{print $NF}') + if [[ -z "${ezpz_pip_loc:-}" ]]; then + printf "[ezpz_install] Installing ezpz from %s\n" "${ezdir}" + python3 -m pip install -e "${ezdir}" --require-virtualenv else - echo "Does not have ezpz installed. Installing..." - echo "Using $(which python3) to install \`ezpz\`:" - python3 -m pip install -e ezpz > ezpz-install.log 2>&1 + printf "[ezpz_install] Found ezpz @ %s\n" "${ezpz_pip_loc}" fi - echo "Done with ezpz." 
- # source ezpz/src/ezpz/bin/savejobenv || exit # > /tmp/savejobenv.log 2>&1 || exit - # source ezpz/src/ezpz/bin/getjobenv || exit } +####################################################################### +# ezpz_test: Run simple test to make sure all nodes in working order +####################################################################### +ezpz_test() { + printf "%s" "[$(printBlue 'ezpz:test_dist')][INFO] Running ezpz.test_dist...\n" + # [ -n "${PBS_O_WORKIR}" ] && ezpz_savejobenv || ezpz_getjobenv + # python3 -Wignore -m ezpz.jobs && source "${PBS_O_WORKDIR}/.jobenv" + printf "%s" "[$(printBlue 'ezpz:test_dist')] Running test: ${eztest}\n" + eztest="TRAIN_ITERS=50 ${LAUNCH_CMD} python3 -Wignore -m ezpz.test_dist" + eval "${eztest}" + printf "%s" "[$(printBlue 'ezpz:test_dist')] Done with test!\n" +} + +############################################################################ +# saveDSenv +# +# Save important environment variables to .deepspeed_env, which will be +# forwarded to ALL ranks with DeepSpeed +############################################################################ saveDSenv() { echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env" { - echo "PATH=${PATH}" ; - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" ; - echo "http_proxy=${http_proxy}" ; - echo "https_proxy=${https_proxy}" ; - echo "CFLAGS=${CFLAGS}" ; - echo "PYTHONUSERBASE=$PYTHONUSERBASE" ; - } > .deepspeed_env + echo "PATH=${PATH}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + echo "http_proxy=${http_proxy:-}" + echo "https_proxy=${https_proxy:-}" + echo "CFLAGS=${CFLAGS}" + echo "PYTHONUSERBASE=$PYTHONUSERBASE" + } >.deepspeed_env } -setOutput() { +get_output_prefix() { # ---- Specify output location -------------------------------- - export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" - OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" - 
export OUTPUT_DIR="${OUTPUT_DIR}" + pre="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}" + pre="${pre}_hs${HIDDEN}_mb${MICRO_BATCH}" + pre="${pre}_seq${SEQ}_gb${GLOBAL_BATCH}" + pre="${pre}_sp${SP}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" + pre="${pre}_lr${LR}_lwf${LR_WARMUP_FRAC}" + if [[ -n "${TOKENIZER_TYPE:-}" ]]; then + _tok=$(echo "${TOKENIZER_TYPE}" | sed 's/Tokenizer//g') # noqa + pre="${pre}_tok${_tok}" + fi + if [[ -n "${LR_DECAY_ITERS}" ]]; then + pre="${pre}_ldi${LR_DECAY_ITERS}" + fi + if [[ -z "${NO_FLASH_ATTN:-}" ]]; then + pre="${pre}_flash" + fi + export OUTPUT_PREFIX="${pre}" + echo "${pre}" +} + +setOutput() { + # OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" + OUTPUT_PREFIX=$(get_output_prefix) + OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%Y%m%d-%H%M%S)_${WORLD_SIZE}_${HOSTNAME}" + export OUTPUT_DIR="${OUTPUT_DIR}" && mkdir -p "${OUTPUT_DIR}" export OUTPUT_LOG="${OUTPUT_DIR}/output.log" - export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" - echo "${OUTPUT_LOG}" >> "logs/latest" - mkdir -p "${OUTPUT_DIR}" - echo "!!!Please see logs at ${OUTPUT_DIR}" + echo "${OUTPUT_LOG}" >>"logs/latest" + printf "\n Please see logs at: %s\n" "$(printGreen "${OUTPUT_DIR}")" } -buildDSconfig() { - # ---- Build DeepSpeed Config --------------------------------- - export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" - echo "DS_CONFIG: ${DS_CONFIG}" - printf "ZS: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" ${ZERO_STAGE} ${MICRO_BATCH} ${GLOBAL_BATCH} ${PP} ${DTYPE} - if [[ -z "${CPU_OPTIMIZER}" ]]; then - bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" #|| exit 1 +get_checkpoint_dir() { + if [[ -n "${CKPT_DIR:-}" ]]; then + echo "${CKPT_DIR}" else - echo "!!! Using CPU Optimizer !!!" 
- bash "${PBS_O_WORKDIR}/generate_config_cpu_optimizer.sh" "${DS_CONFIG}" + echo "checkpoints/$(get_output_prefix)" fi - # ------------------------------------------------------------- } +setup_checkpoint() { + ckpt_dir=$(get_checkpoint_dir) + export CKPT_DIR="${ckpt_dir}" + printf "Checkpoints will be saved to: %s\n" "$(printYellow "${CKPT_DIR}")" +} +############################################# +# Build DeepSpeed config and write to .json +############################################# +buildDSconfig() { + # export CPU_OPTIMIZER="${CPU_OPTIMIZER:-0}" + export DS_CONFIG="${WORKING_DIR}/ds-configs/ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" + mkdir -p "$(dirname "${DS_CONFIG}")" + echo "DS_CONFIG: ${DS_CONFIG}" + printf "ZS: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" "${ZERO_STAGE}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}" + generateDSconfig "${DS_CONFIG}" + cat "${DS_CONFIG}" | jq . +} + +############################################################################### +# sumWeights +# +# This will sum the weights (first column) from each line in the passed +# `file_list`. +############################################################################### sumWeights() { local file_list=$1 weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") - # weights=$(echo "$weights" | tr ",]" "]") - # echo "weights: $weights" python3 -c "import numpy as np; print(np.sum(${weights}))" } @@ -198,82 +948,190 @@ sumFiles() { done } +########################################### +# make_data +# +# This will run `make` in `megatron/data` +# prior to launching, ensuring that +# `megatron/data/helpers.cpp` +# is built appropriately. 
+########################################### +make_data() { + python3 -m pip install pybind11 + mdir="${WORKING_DIR}/megatron/data" + cd "${mdir}" && make && cd - +} -setEnv() { - # ---- [SunSpot] ------- || ---- [Aurora] -------------- - if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then - PBS_PARENT=$(dirname ${PBS_O_WORKDIR}) - echo "Sourcing ${PBS_PARENT}/setenv.sh..." - source "${PBS_PARENT}/setenv.sh" || exit - # ----- [Aurora] ----------------------------------- - if [[ $(hostname) == x4* ]]; then - eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 - # ----- [SunSpot] ---------------------------------- - elif [[ $(hostname) == x1* ]]; then - echo "Running on SunSpot !!" - eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop - fi - # ----- [Polaris] --------------------------------------- - elif [[ $(hostname) == x3* ]]; then - echo "Running on Polaris !!" - # ---- [load conda] --------------------- - module load conda/2023-10-04; conda activate cu118-pt221 ; unset PYTHONUSERBASE - # module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/py311-cu118 - # ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 - # export PYTHONUSERBASE="${HOME}/.local/polaris/conda/py311-cu118" - # mkdir -p "${PYTHONUSERBASE}" - # if [[ "${VIRTUAL_ENV}" ]]; then - # echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from environment!!" 
- # else - # echo "Not using VIRTUAL_ENV" - # # sourceFile "${HERE}/venvs/polaris/2023-10-04/bin/activate" || exit +############################################################################## +# install_dependencies +# +# Ensure all dependencies installed from `ALCF/requirements/requirements.txt` +############################################################################## +install_dependencies() { + depsfile="${WORKING_DIR}/ALCF/requirements/requirements.txt" + echo "[install_dependencies] Ensuring all dependencies from ${depsfile} installed..." + python3 -m pip install -r "${depsfile}" --require-virtualenv 1>/dev/null + if [[ ! -x "$(command -v deepspeed)" ]]; then + mn=$(get_machine_name) + # if [[ "${mn}" == aurora* || "${mn}" == sunspot* ]]; then + # install_deepspeed_for_xpu || exit # fi - else # ------------------------------------- [Unknown] ------------------- - echo "Unknown hostname $(hostname)" - exit 1 + printf "[install_dependencies] No 'deepspeed' command found on %s" "${mn}" + printf "[install_dependencies] !! No deepsepeed in %s" "$(which python3)" + fi +} + +################################################# +# Fix for distributed key value store on Aurora +################################################# +use_kvs_fix_on_aurora() { + export CCL_KVS_MODE=mpi + export CCL_CONFIGURATION_PATH="" + export LD_LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LD_LIBRARY_PATH + export CPATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/include:$CPATH + export LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LIBRARY_PATH + ######################################################### + # if not set, CCL will complain... ? 
+ export NUMEXPR_MAX_THREADS="${NUMEXPR_MAX_THREADS:-16}" + ######################################################### +} + +update_ccl_env_vars_aurora() { + # export CCL_KVS_MODE=mpi + # # export CCL_CONFIGURATION_PATH="" + # # unset CCL_CONFIGURATION_PATH + # # export CCL_CONFIGURATION=cpu_gpu_dpcpp + # # export CCL_ROOT="/flare/Aurora_deployment/intel/ccl/_install_release_2021_13" + # export LD_LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LD_LIBRARY_PATH + # export CPATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/include:$CPATH + # export LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LIBRARY_PATH + # # export CCL_ALLREDUCE_SCALEOUT=direct + # printenv | grep -E -v "^__" | grep -E "CCL|LD|CPATH|LIBRARY_PATH" + ######################################################### + # if not set, CCL will complain... ? + export NUMEXPR_MAX_THREADS="${NUMEXPR_MAX_THREADS:-16}" + ######################################################### + # Sam: [2024-06-29] + export CCL_KVS_MODE=mpi + export CCL_CONFIGURATION_PATH="" + export CCL_CONFIGURATION=cpu_gpu_dpcpp + export CCL_ROOT="/flare/Aurora_deployment/intel/ccl/_install_release_2021_13" + export LD_LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LD_LIBRARY_PATH + export CPATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/include:$CPATH + export LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LIBRARY_PATH +} + +########################################################## +# Check that we can find the `.py` file we wish to launch +########################################################## +check_executable() { + fp=$1 + if [[ -f "${fp}" ]]; then + export EXEC="${fp}" + # ----[1.5 Keep track of stem from file path]------------------------- + exec_stem=$(echo "${EXEC}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.py//g") + export EXEC_STEM="${exec_stem}" + else + estr="Unable to 
locate executable ${fp}" + printf "[ALCF.helpers:check_executable] %s\n" "$(printRed "${estr}")" fi } +###################################################################### +# `makeHostiles`: +# Detect if `HOSTFILE` set in active environment. +# - If so, use this. +# - Otherwise, make default HOSTFILEs from "${PBS_NODEFILE}" +###################################################################### makeHostfiles() { - # GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') - # source $(python3 -c 'import ezpz; print(ezpz.SAVEJOBENV.as_posix())') || exit - # source $(python3 -c 'import ezpz; print(ezpz.GETJOBENV.as_posix())') || exit - source ezpz/src/ezpz/bin/savejobenv || exit #> /tmp/savejobenv.log 2>&1 & - source ezpz/src/ezpz/bin/getjobenv || exit - export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST}}" - # ---- Make MPICH hostfile ---------------- - hf="${HOSTFILE:-${PBS_NODEFILE}}" - export hostfile_mpich=hostfile_mpich - cat "${hf}" > "${hostfile_mpich}" - # ---- Make DeepSpeed hostfile ------------------- - export hostfile_deepspeed=hostfile_deepspeed - cat "${hf}" > "${hostfile_deepspeed}" - sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" + if [[ -n "${HOSTFILE}" ]]; then + printf "!! USING CUSTOM HOSTFILE FROM: %s" "${HOSTFILE}" + else + make_ds_hostfile + fi } -setData() { # ---- [dfl: abbrv. 
##################################################
# setup_tokenizer_and_data
#
# Setup tokenizer as either Llama2 or GPT2 style, together with the
# matching data flags.
#
# Usage:
#   setup_tokenizer_and_data <tokenizer> [data_file_list]
#
# Arguments:
#   $1  tokenizer name; anything matching `gpt*` / `GPT*` selects the
#       GPT2 BPE tokenizer, everything else takes the Llama-style branch.
#   $2  (optional) data file list; defaults to ${DATA_FILE_LIST:-}.
#
# Exports:
#   TOKENIZER_TYPE, TOKENIZER_FLAGS, DATA_FLAGS, and
#     - GPT2 branch:   DATA_PARENT, VOCAB_FILE, MERGE_FILE, DATA_PATH
#     - other branch:  TOKENIZER_MODEL (`setData` is then called with the
#                      data file list)
#
# Returns 1 when called with anything other than 1 or 2 arguments.
##################################################
setup_tokenizer_and_data() {
    if [[ "$#" == 1 ]]; then
        tok="$1"
        dfl="${DATA_FILE_LIST:-}"
    elif [[ "$#" == 2 ]]; then
        tok="$1"
        dfl="$2"
    else
        # Stop here: continuing would use `tok` / `dfl` that are unset or
        # left over from an earlier call.
        echo "Incorrect number of arguments passed. Received: $#, expected 1 or 2"
        return 1
    fi
    echo "Setting up tokenizer with ${tok}"
    echo "Using data_file_list: ${dfl}"
    _data_flags=()
    _tokenizer_flags=()
    if [[ ${tok} == gpt* || ${tok} == GPT* ]]; then
        export TOKENIZER_TYPE="GPT2"
        _tokenizer_flags+=("--tokenizer-type GPT2BPETokenizer")
        # DATA_PARENT from the environment wins; otherwise pick the
        # per-machine default location of the GPT2 dataset.
        machine=$(get_machine_name)
        if [[ ${machine} == "polaris" || ${machine} == "sophia" ]]; then
            export DATA_PARENT="${DATA_PARENT:-/eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed/dataset}"
        elif [[ ${machine} == "sunspot" ]]; then
            export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}"
        elif [[ ${machine} == "aurora" ]]; then
            export DATA_PARENT="${DATA_PARENT:-/gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/dataset}"
        else
            export DATA_PARENT="${DATA_PARENT:-${WORKING_DIR}/dataset}"
        fi
        export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json"
        export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt"
        export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document"
        _data_flags+=(
            "--data-path ${DATA_PATH}"
            "--vocab-file ${VOCAB_FILE}"
            "--merge-file ${MERGE_FILE}"
        )
    else
        export TOKENIZER_TYPE="${TOKENIZER_TYPE:-Llama2Tokenizer}"
        tm="${WORKING_DIR}/ALCF/tokenizer.model"            # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model
        export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^
        _tokenizer_flags+=(
            "--tokenizer-type ${TOKENIZER_TYPE}"
            "--tokenizer-model ${TOKENIZER_MODEL}"
        )
        # Report the list that is actually handed to setData (which may
        # differ from ${DATA_FILE_LIST} when $2 was given).
        echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${dfl}"
        setData "${dfl}" || exit
    fi
    export DATA_FLAGS="${_data_flags[*]:-}"
    export TOKENIZER_FLAGS="${_tokenizer_flags[*]}"
    printf "[setData] DATA_FLAGS: %s\n" "$(printGreen "${DATA_FLAGS}")"
    printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta "${TOKENIZER_FLAGS}")"
}
for DATA_FILE_LIST] + ####### [Set DATA_FILE_LIST_FALLBACK based on current machine] ############# + mn=$(get_machine_name) + dfl_fallback="${WORKING_DIR}/ALCF/data-lists/${mn}/dolma.txt" + ############################################################################ + # set `dfl` to `dfl_fallback` if not passed as an argument, + # use this data file list to call `setData` dfl="${1:-${dfl_fallback}}" - # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" - printf "Calling: \`setData()\` with %s\n" "${dfl}" - ndocs=$(wc -l < "${dfl}") + printf "Calling: setData() with %s\n" "${dfl}" + ndocs=$(wc -l <"${dfl}") ws=$(sumWeights "${dfl}") dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") - dcp="${HERE}/.cache/${dfl_stem}/index-cache" - mkdir -p dcp + dcp=".cache/${dfl_stem}/index-cache" export DATA_FILE_LIST="${dfl}" export NUM_DOCS="${ndocs}" export WEIGHT_SUM="${ws}" export DFL_STEM="${dfl_stem}" export DATA_CACHE_PATH="${dcp}" + # export DATA_FLAGS="${DATA_FLAGS} --data-file-list ${DATA_FILE_LIST}" # --data-cache-path ${DATA_CACHE_PATH}" echo "--------------------" echo "Updated environment:" printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}" @@ -281,54 +1139,259 @@ setData() { # ---- [dfl: abbrv. 
# generateDSconfig_new
#
# Write a minimal DeepSpeed config (bf16 enabled, fp32 gradient
# accumulation) to the path stored in ${CONFIG_JSON}.
#
# Reads from the environment:
#   CONFIG_JSON, GLOBAL_BATCH, MICRO_BATCH, ZERO_STAGE
generateDSconfig_new() {
    # `<<EOT` feeds the text up to the closing EOT into cat;
    # `>` writes it to ${CONFIG_JSON} (overwriting any existing file).
    cat <<EOT >"${CONFIG_JSON}"
    {
      "train_batch_size" : $GLOBAL_BATCH,
      "train_micro_batch_size_per_gpu": $MICRO_BATCH,
      "steps_per_print": 1,

      "zero_optimization": {
        "stage": $ZERO_STAGE
      },

      "bf16": {
        "enabled": true
      },

      "data_types": {
        "grad_accum_dtype": "fp32"
      },

      "wall_clock_breakdown" : false
    }
EOT
}
################################################################################
# generateDSconfig
#
# Create and save a DeepSpeed config `.json`, assembled from variables set in
# the current environment.
#
# Usage:
#   generateDSconfig <config_file>
#
# Required environment:
#   GLOBAL_BATCH, MICRO_BATCH, GRAD_ACC_STEPS, ZERO_STAGE, PP, DTYPE
#
# Optional environment:
#   OPT            "ds.adamw"      -> AdamW section (reads LR, ADAM_BETA1,
#                                     ADAM_BETA2, ADAM_EPS)
#                  "ds.onebitlamb" -> OneBitLamb section (reads BE)
#                  anything else   -> no "optimizer" section
#   CPU_OPTIMIZER  non-empty, ZeRO stage 1 / 2 only: offload optimizer to CPU
#   COMMS_LOGGER   value of comms_logger.enabled when PP <= 1 (default: false)
#
# Exits with status 1 when the argument or a required variable is missing.
################################################################################
generateDSconfig() {
    if [ $# -ne 1 ]; then
        echo "Usage: $0 config_file"
        exit 1
    fi
    for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" "$PP" "$DTYPE"; do
        if [ -z "$v" ]; then
            echo "Please export required envs before execute $0"
            exit 1
        fi
    done

    # Every fragment below is either empty or a run of `"key": value,` pairs
    # ending in a comma; `flops_profiler` goes last and carries no comma, so
    # the concatenation is a valid JSON object.
    extra=""
    optimizer=""
    zero=""
    dtype=""

    common="\
    \"train_batch_size\": $GLOBAL_BATCH,
    \"train_micro_batch_size_per_gpu\": $MICRO_BATCH,
    \"gradient_clipping\": 1.0,
    \"steps_per_print\": 1,
    \"gradient_accumulation_steps\": $GRAD_ACC_STEPS,
    \"zero_force_ds_cpu_optimizer\": false,
    \"zero_allow_untested_optimizer\": true,
    \"wall_clock_breakdown\": false,"

    # ---- precision ----------------------------------------------------------
    if [[ $DTYPE == "bf16" ]]; then
        dtype="\
    \"fp16\": {
      \"enabled\": false,
      \"loss_scale\": 0,
      \"loss_scale_window\": 1000,
      \"hysteresis\": 2,
      \"min_loss_scale\": 1
    },
    \"bfloat16\": {
      \"enabled\": true,
      \"loss_scale\": 1.0
    },"
    elif [[ $DTYPE == "fp16" ]]; then
        dtype="\
    \"communication_data_type\": \"fp16\",
    \"fp16\": {
      \"enabled\": true,
      \"loss_scale\": 0,
      \"loss_scale_window\": 1000,
      \"hysteresis\": 2,
      \"min_loss_scale\": 1
    },
    \"bfloat16\": {
      \"enabled\": false,
      \"loss_scale\": 1.0
    },"
    else
        dtype="\"communication_data_type\": \"fp32\","
    fi

    # ---- optimizer ----------------------------------------------------------
    if [[ "${OPT:-}" == "ds.adamw" ]]; then
        # NOTE: no comma after the closing brace of "params" -- JSON does not
        # allow a trailing comma before `}`.
        optimizer="\
    \"optimizer\": {
      \"type\": \"AdamW\",
      \"params\": {
        \"lr\": ${LR},
        \"beta1\": ${ADAM_BETA1},
        \"beta2\": ${ADAM_BETA2},
        \"eps\": ${ADAM_EPS},
        \"weight_decay\": 1e-1
      }
    },"
    elif [[ "${OPT:-}" == "ds.onebitlamb" ]]; then
        optimizer="\
    \"optimizer\": {
      \"type\": \"OneBitLamb\",
      \"params\": {
        \"lr\": 11e-3,
        \"max_coeff\": 0.3,
        \"min_coeff\": 0.01,
        \"freeze_step\": 1000,
        \"cuda_aware\": false,
        \"comm_backend_name\": \"${BE}\",
        \"coeff_beta\": 0.9,
        \"factor_max\": 4.0,
        \"factor_min\": 0.5,
        \"factor_threshold\": 0.1
      }
    },"
    fi

    # ---- ZeRO ---------------------------------------------------------------
    if [[ "${ZERO_STAGE}" == 3 ]]; then
        zero="\
    \"zero_optimization\": {
      \"stage\": 3,
      \"reduce_scatter\": false,
      \"mics_hierarchical_params_gather\": true,
      \"stage3_max_live_parameters\": 3e9,
      \"stage3_max_reuse_distance\": 3e9,
      \"stage3_param_persistence_threshold\": 1e5,
      \"stage3_prefetch_bucket_size\": 5e7,
      \"contiguous_gradients\": true,
      \"overlap_comm\": true,
      \"reduce_bucket_size\": 90000000,
      \"sub_group_size\": 1e9,
      \"offload_optimizer\": {
        \"device\": \"none\",
        \"buffer_count\": 4,
        \"pipeline_read\": false,
        \"pipeline_write\": false,
        \"pin_memory\": true
      }
    },"
    elif [[ "${ZERO_STAGE}" == 2 || "${ZERO_STAGE}" == 1 ]]; then
        if [[ -n "${CPU_OPTIMIZER:-}" ]]; then
            echo "!!!! CAUGHT CPU_OPTIMIZER !!!!"
            zero="\
    \"zero_optimization\": {
      \"stage\": $ZERO_STAGE,
      \"offload_optimizer\": {
        \"device\": \"cpu\"
      }
    },"
        else
            zero="\
    \"zero_optimization\": {
      \"stage\": $ZERO_STAGE
    },"
        fi
        if [[ "${PP}" -gt 1 ]]; then
            extra="\
    \"data_types\": {
      \"grad_accum_dtype\": \"fp32\"
    },
    \"comms_logger\": {
      \"enabled\": true,
      \"verbose\": false,
      \"prof_all\": true,
      \"debug\": false
    },"
        else
            extra="\
    \"comms_logger\": {
      \"enabled\": ${COMMS_LOGGER:-false},
      \"verbose\": false,
      \"debug\": false
    },"
        fi
    else
        # Any other stage: no "zero_optimization" section is written.
        echo 'Please add the correct config set!!!'
    fi

    flops_profiler="\
    \"flops_profiler\": {
      \"enabled\": true,
      \"profile_step\": 2,
      \"module_depth\": -1,
      \"top_modules\": 1,
      \"detailed\": true,
      \"output_file\": null
    }"

    # `<<EOT` feeds the assembled fragments to cat; `>` writes them to $1.
    cat <<EOT >"$1"
{
$common
$optimizer
$zero
$dtype
$extra
$flops_profiler
}
EOT
}
###############################################
# convert_ckpt_to_universal
#
# Run DeepSpeed's `ds_to_universal.py` on the most recent checkpoint found
# in `ckpt_dir`.
#
# Usage:
#   convert_ckpt_to_universal <ckpt_dir>
#
# The step is read from `<ckpt_dir>/latest_checkpointed_iteration.txt`:
#   input:   <ckpt_dir>/global_step<N>
#   output:  <ckpt_dir>/global_step<N>_universal
#
# The converter is looked up at
#   ${PBS_O_WORKDIR}/deps/DeepSpeed/checkpoint/ds_to_universal.py
#
# Exits with status 1 on a wrong argument count; returns 1 when the latest
# iteration file is missing or empty; otherwise returns the converter's
# exit status.
###############################################
convert_ckpt_to_universal() {
    if [[ "$#" -ne 1 ]]; then
        echo "Usage: convert_ckpt_to_universal ckpt_dir"
        echo "Expected one argument (ckpt_dir), received: $#"
        exit 1
    fi
    ckptdir=$1
    local latest="${ckptdir}/latest_checkpointed_iteration.txt"
    # Without this file there is no step number, and the converter would be
    # launched on a bogus ".../global_step" path.
    if [[ ! -f "${latest}" ]]; then
        echo "Unable to locate ${latest}"
        return 1
    fi
    gs=$(cat "${latest}")
    if [[ -z "${gs}" ]]; then
        echo "No iteration recorded in ${latest}"
        return 1
    fi
    src="${ckptdir}/global_step${gs}"
    dst="${ckptdir}/global_step${gs}_universal"
    convert_script="${PBS_O_WORKDIR}/deps/DeepSpeed/checkpoint/ds_to_universal.py"
    python3 "${convert_script}" --input_folder "${src}" --output_folder "${dst}"
}
def load_model(ckpt_dir: str, step: Optional[int] = None):
    """Load a ``LlamaForCausalLM`` from a converted (HF-format) checkpoint.

    Args:
        ckpt_dir: Checkpoint location. Used as-is when ``step`` is ``None``;
            otherwise treated as the parent of ``global_step{step}_hf``.
        step: Optional global step selecting the ``global_step{step}_hf``
            sub-directory of ``ckpt_dir``.

    Returns:
        The model returned by ``LlamaForCausalLM.from_pretrained``.

    Raises:
        FileNotFoundError: If the resolved checkpoint path does not exist.
    """
    ckpt_path = Path(ckpt_dir)
    if step is not None:
        ckpt_path = ckpt_path.joinpath(f"global_step{step}_hf")
    print(f"Loading ckpt from: {ckpt_path}")

    # Guard clause: fail before touching transformers when nothing is there.
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Unable to locate checkpoint at: {ckpt_path}")

    model = LlamaForCausalLM.from_pretrained(ckpt_path.as_posix())
    print(f"{model=}")
    return model
def loop_over_checkpoints(
    steps_list: list[int],
    ckpt_dir: str,
    max_length: int = 128,
    prompt: Optional[str] = None,
):
    """Generate from the checkpoint at each step and print the result.

    For every step in ``steps_list`` the model is loaded with ``load_model``,
    queried with ``eval_model``, and the generated text plus the elapsed
    wall-clock time are printed.

    Args:
        steps_list: Global steps to evaluate, in order.
        ckpt_dir: Directory passed to ``load_model`` as ``ckpt_dir``.
        max_length: Forwarded to ``eval_model``.
        prompt: Prompt to generate from; defaults to
            ``"What is it like in there?"`` when ``None``.
    """
    # The default does not change between steps, so resolve it once.
    prompt = "What is it like in there?" if prompt is None else prompt
    for step in steps_list:
        t0 = time.perf_counter()
        print(f"\n Loading model from checkpoint at global step: {step}")
        outputs = eval_model(
            # load_model is declared as (ckpt_dir, step): directory first.
            load_model(ckpt_dir, step),
            max_length=max_length,
            prompt=prompt,
        )
        print(f"{outputs}")
        print(f"\ntook: {time.perf_counter() - t0:.6f}s\n")
(input_layernorm): LlamaRMSNorm((4096,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((4096,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + (lm_head): Linear(in_features=4096, out_features=32000, bias=False) +) + +>>> print( +... eval_model( +... model, +... max_length=128, +... prompt="What is it like in there?", +... tokenizer=tokenizer +... ) +... ) +Setting `pad_token_id` to `eos_token_id`:None for open-end generation. +Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, whereit will always be FP32) +What is it like in there? +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. 
# convert_mds_to_hf
#
# Convert one Megatron-DeepSpeed checkpoint step with `mds_to_hf.py`
# (resolved relative to the current directory).
#
# Usage:
#   convert_mds_to_hf <global_step> [ckpt_root] [output_parent]
#
# Arguments:
#   $1  global step; the source is <ckpt_root>/global_step<step>
#   $2  (optional) checkpoint root; defaults to the rollback-41k8 run below
#   $3  (optional) output parent; defaults to the current directory
#
# Output is written to <output_parent>/global_step<step>_hf.
# Exits with status 1 on a wrong argument count or a missing source directory.
convert_mds_to_hf() {
    if [[ "$#" -eq 3 ]]; then
        GLOBAL_STEP=$1
        CKPT_ROOT=$2
        OUTPUT_PARENT=$3
    elif [[ "$#" -eq 2 ]]; then
        GLOBAL_STEP=$1
        CKPT_ROOT=$2
        OUTPUT_PARENT=$(pwd)
    elif [[ "$#" -eq 1 ]]; then
        GLOBAL_STEP=$1
        CKPT_ROOT="/flare/Aurora_deployment/AuroraGPT-Testing/foremans/rollback-41k8/Megatron-DeepSpeed-41800/checkpoints/ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.00020_lwf0.05/"
        OUTPUT_PARENT=$(pwd)
    else
        echo "Expected exactly 1, 2, or 3 arguments (global_step, src, dst, respectively)"
        exit 1
    fi
    SRC="${CKPT_ROOT}/global_step${GLOBAL_STEP}"
    DST="${OUTPUT_PARENT}/global_step${GLOBAL_STEP}_hf"
    if [[ -d "${SRC}" ]]; then
        echo "Converting checkpoint @ global step ${GLOBAL_STEP}"
        # printf, not echo: bash's builtin echo prints "\t" / "\n" literally.
        printf "\tsrc = %s\n\n" "${SRC}"
        printf "\tdst = %s\n\n" "${DST}"
        python3 mds_to_hf.py \
            --mds_checkpoint "${SRC}/mp_rank_00_model_states.pt" \
            --output_dir "${DST}" \
            --cache_dir "./.cache"
    else
        echo "Unable to locate directory ${SRC}. Exiting"
        exit 1
    fi
}
+$ rg --hidden "deepspeed\.initialize" **/**/*.log | grep took +``` + +### Measurements + +| NUM_NODES | WORLD_SIZE | TIME | +|:---------:|:----------:|:----------:| +| 8 | 96 | 61.073 | +| | | | +| 16 | 192 | 107.74411 | +| 16 | 192 | 107.20138 | +| 16 | 192 | 107.10853 | +| | | | +| 32 | 384 | 200.23095 | +| 32 | 384 | 206.49485 | +| 32 | 384 | 200.49485 | +| | | | +| 64 | 768 | 413.55765 | +| 64 | 768 | 394.92617 | +| 64 | 768 | 414.725 | +| 64 | 768 | 387.987 | +| 64 | 768 | 411.72035 | +| 64 | 768 | 394.926 | +| 64 | 768 | 409.375 | +| 64 | 768 | 393.091 | +| 64 | 768 | 412.600 | +| | | | +| 128 | 1536 | 789.30077 | +| 128 | 1536 | 788.86531 | +| 128 | 1536 | 792.71864 | +| 128 | 1536 | 836.98 | +| 128 | 1536 | 801.205 | +| 128 | 1536 | 836.98 | +| 128 | 1536 | 820.9538 | +| 128 | 1536 | 707.048 | +| | | | +| 256 | 3072 | 1639.62374 | +| 256 | 3072 | 1591.345 | +| 256 | 3072 | 1632.12712 | +| 256 | 3072 | 1674.444 | +| 256 | 3072 | 1618.100 | + + +-
WORLD_SIZE=96: + + ```bash title="deepspeed_init_times.sh" + ws96_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125717_96_x4420c5s5b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:59:19][INFO][training:795] - 'deepspeed.initialize' took: 61.07362s + ws96_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125717_96_x4420c5s5b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:59:19][INFO][training:795] - 'deepspeed.initialize' took: 61.07362s + ws96_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125717_96_x4420c5s5b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:59:19][INFO][training:795] - 'deepspeed.initialize' took: 61.07362s + ``` + +
+ +-
WORLD_SIZE = 192: + + ```bash + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-154948_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:52:30][INFO][training:795] - 'deepspeed.initialize' took: 107.74411s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-154948_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:52:30][INFO][training:795] - 'deepspeed.initialize' took: 107.74411s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-154948_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:52:30][INFO][training:795] - 'deepspeed.initialize' took: 107.74411s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamwschedulefree_lr0.0003_lwf0.05/20240623-163640_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:38:52][INFO][training:800] - 'deepspeed.initialize' took: 107.10853s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-160332_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:05:43][INFO][training:800] - 'deepspeed.initialize' took: 107.20138s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-160332_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:05:43][INFO][training:800] - 'deepspeed.initialize' took: 107.20138s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-160332_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:05:43][INFO][training:800] - 'deepspeed.initialize' took: 107.20138s + 
ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamwschedulefree_lr0.0003_lwf0.05/20240623-163640_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:38:52][INFO][training:800] - 'deepspeed.initialize' took: 107.10853s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamwschedulefree_lr0.0003_lwf0.05/20240623-163640_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:38:52][INFO][training:800] - 'deepspeed.initialize' took: 107.10853s + ``` + +
+ +-
WORLD_SIZE = 384: + + ```bash + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-164607_384_x4402c6s7b0n0.hostmgmt2402.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:52:15][INFO][training:800] - 'deepspeed.initialize' took: 206.49485s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-164607_384_x4402c6s7b0n0.hostmgmt2402.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:52:15][INFO][training:800] - 'deepspeed.initialize' took: 206.49485s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-164607_384_x4402c6s7b0n0.hostmgmt2402.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:52:15][INFO][training:800] - 'deepspeed.initialize' took: 206.49485s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-223159_384_x4706c1s6b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:37:53][INFO][training:800] - 'deepspeed.initialize' took: 200.23095s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-223159_384_x4706c1s6b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:37:53][INFO][training:800] - 'deepspeed.initialize' took: 200.23095s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-223159_384_x4706c1s6b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:37:53][INFO][training:800] - 'deepspeed.initialize' took: 200.23095s + ``` + +
+ +-
WORLD_SIZE=768: + + ```bash + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-180052_768_x4704c4s1b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:12:43][INFO][training:800] - 'deepspeed.initialize' took: 394.92617s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-185626_768_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 19:05:45][INFO][training:800] - 'deepspeed.initialize' took: 414.72580s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-185626_768_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 19:05:45][INFO][training:800] - 'deepspeed.initialize' took: 414.72580s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-233045_768_x4711c0s1b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 23:39:19][INFO][training:797] - 'deepspeed.initialize' took: 387.98744s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-233045_768_x4711c0s1b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 23:39:19][INFO][training:797] - 'deepspeed.initialize' took: 387.98744s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-233045_768_x4711c0s1b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 23:39:19][INFO][training:797] - 'deepspeed.initialize' took: 387.98744s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-141802_768_x4706c2s0b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:27:50][INFO][training:795] - 'deepspeed.initialize' took: 411.72035s + 
ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-141802_768_x4706c2s0b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:27:50][INFO][training:795] - 'deepspeed.initialize' took: 411.72035s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-180052_768_x4704c4s1b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:12:43][INFO][training:800] - 'deepspeed.initialize' took: 394.92617s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-180052_768_x4704c4s1b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:12:43][INFO][training:800] - 'deepspeed.initialize' took: 394.92617s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-134324_768_x4705c2s1b0n0.hostmgmt2705.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:51:19][INFO][training:795] - 'deepspeed.initialize' took: 393.09134s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-185626_768_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 19:05:45][INFO][training:800] - 'deepspeed.initialize' took: 414.72580s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-165713_768_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:06:47][INFO][training:800] - 'deepspeed.initialize' took: 389.15768s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122601_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:35:18][INFO][training:793] - 'deepspeed.initialize' took: 409.37578s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-134324_768_x4705c2s1b0n0.hostmgmt2705.cm.aurora.alcf.anl.gov/output.log: 
+ [2024-06-23 13:51:19][INFO][training:795] - 'deepspeed.initialize' took: 393.09134s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-134324_768_x4705c2s1b0n0.hostmgmt2705.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:51:19][INFO][training:795] - 'deepspeed.initialize' took: 393.09134s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-141802_768_x4706c2s0b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:27:50][INFO][training:795] - 'deepspeed.initialize' took: 411.72035s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-165713_768_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:06:47][INFO][training:800] - 'deepspeed.initialize' took: 389.15768s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124517_768_x4315c4s1b0n0.hostmgmt2315.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:57:42][INFO][training:795] - 'deepspeed.initialize' took: 395.05079s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124517_768_x4315c4s1b0n0.hostmgmt2315.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:57:42][INFO][training:795] - 'deepspeed.initialize' took: 395.05079s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-165713_768_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:06:47][INFO][training:800] - 'deepspeed.initialize' took: 389.15768s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-130702_768_x4420c6s7b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:15:22][INFO][training:795] - 'deepspeed.initialize' took: 412.60004s + 
ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-130702_768_x4420c6s7b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:15:22][INFO][training:795] - 'deepspeed.initialize' took: 412.60004s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122601_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:35:18][INFO][training:793] - 'deepspeed.initialize took: 409.37578s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122601_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:35:18][INFO][training:793] - 'deepspeed.initialize took: 409.37578s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-173730_768_x4707c5s6b0n0.hostmgmt2707.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:45:33][INFO][training:800] - 'deepspeed.initialize' took: 400.74402s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124517_768_x4315c4s1b0n0.hostmgmt2315.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:57:42][INFO][training:795] - 'deepspeed.initialize' took: 395.05079s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-130702_768_x4420c6s7b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:15:22][INFO][training:795] - 'deepspeed.initialize' took: 412.60004s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-132452_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:34:32][INFO][training:795] - 'deepspeed.initialize' took: 413.55765s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-173730_768_x4707c5s6b0n0.hostmgmt2707.cm.aurora.alcf.anl.gov/output.log: + 
[2024-06-23 17:45:33][INFO][training:800] - 'deepspeed.initialize' took: 400.74402s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-173730_768_x4707c5s6b0n0.hostmgmt2707.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:45:33][INFO][training:800] - 'deepspeed.initialize' took: 400.74402s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-132452_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:34:32][INFO][training:795] - 'deepspeed.initialize' took: 413.55765s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-132452_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:34:32][INFO][training:795] - 'deepspeed.initialize' took: 413.55765s + ``` + +
+ +-
WORLD_SIZE = 1536: + + ```bash + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-162028_1536_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:37:49][INFO][training:800] - 'deepspeed.initialize' took: 789.30077s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-162028_1536_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:37:49][INFO][training:800] - 'deepspeed.initialize' took: 789.30077s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-162028_1536_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:37:49][INFO][training:800] - 'deepspeed.initialize' took: 789.30077s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-145656_1536_x4119c5s7b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:14:35][INFO][training:795] - 'deepspeed.initialize' took: 788.86531s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-145656_1536_x4119c5s7b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:14:35][INFO][training:795] - 'deepspeed.initialize' took: 788.86531s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-145656_1536_x4119c5s7b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:14:35][INFO][training:795] - 'deepspeed.initialize' took: 788.86531s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122207_1536_x4309c6s4b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:39:56][INFO][training:793] - 'deepspeed.initialize' took: 792.71864s + 
ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122207_1536_x4309c6s4b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:39:56][INFO][training:793] - 'deepspeed.initialize' took: 792.71864s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122207_1536_x4309c6s4b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:39:56][INFO][training:793] - 'deepspeed.initialize' took: 792.71864s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125001_1536_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:06:47][INFO][training:795] - 'deepspeed.initialize' took: 836.98388s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125001_1536_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:06:47][INFO][training:795] - 'deepspeed.initialize' took: 836.98388s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-175213_1536_x4702c1s4b0n0.hostmgmt2702.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:10:54][INFO][training:800] - 'deepspeed.initialize' took: 801.20500s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-175213_1536_x4702c1s4b0n0.hostmgmt2702.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:10:54][INFO][training:800] - 'deepspeed.initialize' took: 801.20500s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-184503_1536_x4702c1s4b0n0.hostmgmt2702.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 19:04:07][INFO][training:800] - 'deepspeed.initialize' took: 801.15950s + 
ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-131641_1536_x4315c4s1b0n0.hostmgmt2315.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:33:00][INFO][training:795] - 'deepspeed.initialize' took: 801.11322s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-213107_1536_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 21:46:29][INFO][training:800] - 'deepspeed.initialize' took: 820.95380s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-155216_1536_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:07:23][INFO][training:795] - 'deepspeed.initialize' took: 787.04806s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-141727_1536_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:34:20][INFO][training:795] - 'deepspeed.initialize' took: 809.36787s + ``` + +
+ +-
WORLD_SIZE = 3072: + + ```bash + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-144534_3072_x4309c6s2b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:18:41][INFO][training:795] - 'deepspeed.initialize' took: 1639.62374s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-144534_3072_x4309c6s2b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:18:41][INFO][training:795] - 'deepspeed.initialize' took: 1639.62374s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-144534_3072_x4309c6s2b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:18:41][INFO][training:795] - 'deepspeed.initialize' took: 1639.62374s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-213304_3072_x4704c0s6b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:03:15][INFO][training:800] - 'deepspeed.initialize' took: 1591.34487s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-213304_3072_x4704c0s6b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:03:15][INFO][training:800] - 'deepspeed.initialize' took: 1591.34487s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-213304_3072_x4704c0s6b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:03:15][INFO][training:800] - 'deepspeed.initialize' took: 1591.34487s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-170636_3072_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:37:20][INFO][training:800] - 'deepspeed.initialize' took: 1632.12712s + 
ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-170636_3072_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:37:20][INFO][training:800] - 'deepspeed.initialize' took: 1632.12712s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-170636_3072_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:37:20][INFO][training:800] - 'deepspeed.initialize' took: 1632.12712s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124519_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:16:22][INFO][training:795] - 'deepspeed.initialize' took: 1674.44393s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124519_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:16:22][INFO][training:795] - 'deepspeed.initialize' took: 1674.44393s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124519_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:16:22][INFO][training:795] - 'deepspeed.initialize' took: 1674.44393s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-140113_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:30:23][INFO][training:795] - 'deepspeed.initialize' took: 1618.10035s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-140113_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:30:23][INFO][training:795] - 'deepspeed.initialize' took: 1618.10035s + 
ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-140113_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:30:23][INFO][training:795] - 'deepspeed.initialize' took: 1618.10035s + ``` + +
diff --git a/ALCF/requirements/requirements.txt b/ALCF/requirements/requirements.txt new file mode 100644 index 0000000000..03541ba514 --- /dev/null +++ b/ALCF/requirements/requirements.txt @@ -0,0 +1,18 @@ +hjson +ninja +psutil +py-cpuinfo +pydantic +tqdm +transformers +bitsandbytes +sentencepiece +einops +xgboost +fixedint +pybind11 +six +numpy<2 +schedulefree +packaging>=20.0 +wandb diff --git a/ALCF/sunspot-env-2024-04-15-002.sh b/ALCF/sunspot-env-2024-04-15-002.sh new file mode 100644 index 0000000000..3b7155675d --- /dev/null +++ b/ALCF/sunspot-env-2024-04-15-002.sh @@ -0,0 +1,4 @@ +#!/bin/bash --login + +module use /soft/preview-modulefiles/24.086.0 +module load frameworks/2024.04.15.002.lua diff --git a/ALCF/sunspot-env.sh b/ALCF/sunspot-env.sh new file mode 100644 index 0000000000..8b02542b20 --- /dev/null +++ b/ALCF/sunspot-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash --login +# +module use /home/ftartagl/graphics-compute-runtime/modulefiles +module load graphics-compute-runtime/agama-ci-devel-803.29 +module load spack-pe-gcc/0.6.1-23.275.2 +module load gcc/12.2.0 +module use /soft/preview-modulefiles/24.086.0 +module load oneapi/release/2024.04.15.001 diff --git a/ALCF/test_alcf.sh b/ALCF/test_alcf.sh new file mode 100644 index 0000000000..853addc59d --- /dev/null +++ b/ALCF/test_alcf.sh @@ -0,0 +1,166 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on {Polaris, Sunspot, Sirius} @ ALCF +# to launch (inside an interactive `qsub -I` job) on Polaris: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_alcf.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +setup_conda_sunspot() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$(~/miniconda3/bin/conda shell hook -s posix)" + conda activate q4-drop + else + 
echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_sirius() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_polaris() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + # export CUDA_HOME=/soft/compilers/cudatoolkit/cuda-12.2.2 + # && export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba && eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" ; mm activate 2024-04-25 + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + + +function setEnv() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." 
+ echo "Setting up conda" + # setup_conda + # ---- [SunSpot] ------- || ---- [Aurora] -------------- + if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then + source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit + # ----- [Aurora] ----------------------------------- + if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + if [[ $(hostname) == x4* ]]; then + eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 + # ----- [SunSpot] ---------------------------------- + elif [[ $(hostname) == x1* ]]; then + echo "Running on SunSpot !!" + setup_conda_sunspot + # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + fi + fi + # ----- [Polaris] --------------------------------------- + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + echo "Running on Sirius !!" + setup_conda_sirius + else + echo "Running on Polaris !!" + # ---- [load conda] --------------------- + setup_conda_polaris + # if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then + # source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + # fi + fi + elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + echo "Running on Perlmutter !!" + module load pytorch + source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate" + else # ------------------------------------- [Unknown] ------------------- + echo "Unknown hostname $(hostname)" + exit 1 + fi + else + echo "Unable to setup python environment. 
Exiting" + exit 1 + fi + echo "[python] Using: $(which python3)" +} + + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-polaris-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + SUBMITTED_FROM=$(echo $PBS_O_HOST | tr '-' ' ' | awk '{print $1}') + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/${SUBMITTED_FROM}/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + # export ZERO_STAGE=1 + # export NUM_LAYERS=10 + # export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-${SUBMITTED_FROM}-${NOW}".log +} + +main + diff --git a/ALCF/test_blend_full.sh b/ALCF/test_blend_full.sh index 4245304456..459652a2ee 100755 --- a/ALCF/test_blend_full.sh +++ b/ALCF/test_blend_full.sh @@ -6,16 +6,16 @@ #PBS -l filesystems=eagle:grand:home cd ${PBS_O_WORKDIR} export PPN=4 -export MD=/home/hzheng/ALCF-Megatron-DeepSpeed -module load conda/2023-10-04 -#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 -conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 +export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed +source /eagle/argonne_tpc/soft/conda.sh +export TRITON_CACHE_DIR=/tmp/.cache/ + export TP=1 export PP=1 export SP=128 export MBS=1 export BS=$((MBS*SP)) -export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" HIDDEN_SIZE=4096 @@ -25,8 +25,9 @@ EMBEDDINGS=2048 TRAIN_ITERS=80797 ZERO_STAGE=2 MODEL=LLAMA_7B +export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -python3 ALCF_utils/test_blendable_dataset.py \ +APRUN_PMI=pmix aprun -n $((PBS_JOBSIZE*PPN)) -N $PPN --cc depth -d 16 ${MD}/local_rank.sh python3 ALCF/test_blendable_dataset.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers ${NUM_LAYERS} \ @@ -51,7 +52,6 @@ python3 ALCF_utils/test_blendable_dataset.py \ --lr-warmup-iters 2 \ --optimizer adam \ --adam-beta1 0.9 \ - --mmap_warmup False \ --adam-beta2 0.95 \ --log-interval 1 \ --cpu-optimizer \ @@ -70,4 +70,4 @@ python3 ALCF_utils/test_blendable_dataset.py \ --data-path ${DATA_PATH} \ --data-cache-path /tmp/hzheng-megatron-deepspeed-cache/ \ --vocab-file 
${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ - --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed diff --git a/ALCF/test_blendable_dataset.py b/ALCF/test_blendable_dataset.py index a0cccbb6cb..c119862142 100644 --- a/ALCF/test_blendable_dataset.py +++ b/ALCF/test_blendable_dataset.py @@ -1,4 +1,9 @@ #!/usr/bin/env python +import time +import json +start_time = time.time() +from mpi4py import MPI +import os from megatron.data.gpt_dataset import build_train_valid_test_datasets import numpy as np from megatron.global_vars import set_args, set_global_variables, get_args @@ -6,21 +11,50 @@ from megatron.initialize import initialize_megatron from megatron.data.data_samplers import build_pretraining_data_loader +import torch +from megatron.core import mpu + + +comm = MPI.COMM_WORLD +from megatron.utils import PerfTrace, Profile + + +import datetime +def print_rank_0(msg): + if comm.rank==0: + print(f" [INFO][{datetime.datetime.now()}] {msg}", flush=True) +end_time = time.time() +print_rank_0(f"Loaded python modules in {end_time - start_time} seconds") initialize_megatron(allow_no_cuda=True) +comm.Barrier() +print_rank_0(f"Barrier synchonization time: {time.time() - end_time} seconds") args = get_args() +if os.getenv('DLIO_PROFILER_DATASET_DIR') is not None: + extra_trace_path = os.environ['DLIO_PROFILER_DATASET_DIR'] +else: + extra_trace_path='' +PerfTrace.initialize_log(f"{args.trace_dir}/trace-{comm.rank}-of-{comm.size}.pfw", f"{args.data_cache_path}:{extra_trace_path}:{args.data_path}:{args.save}:{args.load}", process_id=comm.rank) +dlp = Profile("TEST_BLENDABLEDATASET") + +os.makedirs(args.trace_dir, exist_ok=True) +corpus_all = [] data_file_list = args.data_file_list -print(f"Reading data from {args.data_file_list}") +print_rank_0(f"Reading data from {args.data_file_list}") files = [] weights = [] flist = [] with 
open(data_file_list, 'r') as fin: for f in fin.readlines(): - w, fname = f.split() + w, fname, c = f.split() weights.append(float(w)) flist.append(fname) files.append(float(w)) files.append(fname) + files.append(c) + if c not in corpus_all: + corpus_all.append(c) + splits_string="100,0,0" weights = np.array(weights) @@ -28,10 +62,9 @@ num_samples = args.global_batch_size*args.train_iters num_datasets = len(weights) - -print(f"Number of datasets: {num_datasets}") -print(f"Global batch size: {args.global_batch_size}") -print(f"Training iterations: {args.train_iters}") +print_rank_0(f"Number of datasets: {num_datasets}") +print_rank_0(f"Global batch size: {args.global_batch_size}") +print_rank_0(f"Training iterations: {args.train_iters}") train_valid_test_num_samples = [num_samples, 0, 0] seed=args.seed data_impl = args.data_impl @@ -40,34 +73,80 @@ splits_string = "1,0,0" # Build datasets +start_build_dataset = time.time() + +print_rank_0(f"Starting to build the blendable dataset") train_ds, valid_ds, test_ds = build_train_valid_test_datasets(files, data_impl, splits_string, train_valid_test_num_samples, seq_length, seed, skip_warmup, data_cache_path=args.data_cache_path) -dataset_idx = [train_ds.dataset_index[i] for i in range(num_samples)] -ratio_select=np.zeros(num_datasets) -#for i in range(num_datasets): -# ratio_select[i] = np.sum([i==d for d in dataset_idx])/num_samples - -print(f"Total number of samples: {len(train_ds)}") -print(f"Weights set: {weights[:min(8, num_datasets)]}") -#print(f"Weights across training: {ratio_select[:min(8, num_datasets)]}") -for e in range(min(100, args.train_iters)): - ratio_select=np.zeros(num_datasets) - for i in range(num_datasets): - ratio_select[i] = np.sum([i==d for d in dataset_idx[e*args.global_batch_size:(e+1)*args.global_batch_size]])/args.global_batch_size - print(f"iter-{e}: {ratio_select[:min(8, num_datasets)]}") +end_build_dataset = time.time() +print_rank_0(f"Finished building the blendable dataset in 
{end_build_dataset - start_build_dataset} second") +print_rank_0(f"Total number of samples: {len(train_ds)}") +print_rank_0(f"Weights set: {weights[:min(8, num_datasets)]}") -print("First 10 samples") -for i in range(10): - print(f"Sample: {i} \t dataset_idx: {train_ds.dataset_index[i]}, sample_idx: {train_ds.dataset_sample_index[i]}") +def get_sample_info(blendable_dataset, idx): + # corpus dataset + cd = blendable_dataset.dataset_index[idx] + # index within the corpus dataset + cds = blendable_dataset.dataset_sample_index[idx] + # dataset index within each corpus + fcd = blendable_dataset.datasets[cd].dataset_index[cds] + # sample index within the dataset + fcds = blendable_dataset.datasets[cd].dataset_sample_index[cds] + # corresponding data file + prefix = blendable_dataset.datasets[cd].dataset_builders[fcd].prefix + corpus = blendable_dataset.datasets[cd].dataset_builders[fcd].corpus + #v = blendable_dataset[idx]['text'] + #norm = np.linalg.norm(v) + return prefix, corpus, fcds - -#### Build data loaders +num_batches = args.train_iters +print(f"global_batch_size: {args.global_batch_size}") +print(f"number of batches: {num_batches}") + +fout = open("samples_list.jsonl", "w") +if comm.rank == 0: + for i in range(num_batches): + ns_corpus = {} + for c in corpus_all: + ns_corpus[c] = 0 + for j in range(args.global_batch_size): + prefix, corpus, idx = get_sample_info(train_ds, i*args.global_batch_size+j) + ns_corpus[corpus] +=1 + fout.write(f"\u007b 'batch': {i}, 'sample': {j}, 'corpus': '{corpus}', 'prefix': '{prefix}', 'dataset_sample_index': {idx} \u007d\n") + fout.write(f"\u007b 'batch': {i}, 'histogram': {ns_corpus} \u007d \n") +comm.Barrier() +exit() +start_build_dataloader = time.time() +print_rank_0(f"Starting to build the data loader") +rank_in_parallel_group = mpu.get_sequence_parallel_rank() train_dataloader = build_pretraining_data_loader( - train_ds, args.consumed_train_samples) + train_ds, args.consumed_train_samples) valid_dataloader = 
build_pretraining_data_loader( valid_ds, args.consumed_valid_samples) test_dataloader = build_pretraining_data_loader(test_ds, 0) +end_build_dataloader = time.time() +print_rank_0(f"Finished building the data loader in {end_build_dataloader - start_build_dataloader} second") + +print_rank_0(f"Starting loading the data") +start_loading_time = time.time() +NUM_ITEMS=1 +SLEEP_TIME=10.0 +@dlp.log +def compute(ct): + time.sleep(ct) +n=0 +start_time = time.time() +for i in dlp.iter(train_dataloader): + print(f"[{comm.rank}] DATA {i}") + n+=1 + if (n%NUM_ITEMS==0): + print_rank_0(f"Proccessed {n}th-batch in {time.time() - start_time}") + if n>=1000: + break + start_time = time.time() +end_loading_time = time.time() +print_rank_0(f"Finished loading the data ({n} batches) in {end_loading_time - start_loading_time}") diff --git a/ALCF/test_polaris.sh b/ALCF/test_polaris.sh new file mode 100644 index 0000000000..a18c87fad7 --- /dev/null +++ b/ALCF/test_polaris.sh @@ -0,0 +1,88 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Polaris @ ALCF +# to launch (inside an interactive `qsub -I` job) on Polaris: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_polaris.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# 
Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-polaris-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-polaris-${NOW}".log +} + +main diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh new file mode 100755 index 0000000000..0a528a9519 --- /dev/null +++ b/ALCF/test_sirius.sh @@ -0,0 +1,88 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Sirius @ ALCF +# to launch (inside an interactive `qsub -I` job) on Sirius: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_sirius.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-sirius-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." 
+ exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/sirius/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-sirius-${NOW}".log +} + +main diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh new file mode 100755 index 0000000000..b3b22c78b4 --- /dev/null +++ b/ALCF/test_sunspot.sh @@ -0,0 +1,87 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Sunspot @ ALCF +# to launch (inside an interactive `qsub -I` job) on Sirius: +# +# ```bash +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_sunspot.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$(~/miniconda3/bin/conda shell hook -s posix)" + conda activate q4-drop + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-sunspot-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." 
+ exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/sunspot/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-sunspot-${NOW}.log" +} + +main diff --git a/examples_deepspeed/finetune_hf_llama/ds_config.json b/examples_deepspeed/finetune_hf_llama/ds_config.json index 9c0b332473..85f439ce47 100755 --- a/examples_deepspeed/finetune_hf_llama/ds_config.json +++ b/examples_deepspeed/finetune_hf_llama/ds_config.json @@ -1,11 +1,5 @@ { "train_batch_size" : 256, "train_micro_batch_size_per_gpu": 16, - "steps_per_print": 100, - "zero_optimization": { - "stage": 0 - }, - "bf16": { - "enabled": true - } + "steps_per_print": 1 } diff --git a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh index c48ea11b93..ab8bfdf419 100644 --- a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh +++ b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh @@ -1,8 +1,8 @@ DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json -DATASET_PATH=./alpaca_data.json +DATASET_PATH=./examples_deepspeed/finetune_hf_llama/alpaca_data.json # dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json -HF_LLAMA_PATH=/data/llama-7b/ +HF_LLAMA_PATH=/data/llama-2-7b-hf/ # weights link: https://huggingface.co/huggyllama/llama-7b MICRO_BATCH_SIZE=16 @@ -44,11 +44,20 @@ cat < $DS_CONFIG EOT -covert_args="deepspeed tools/hf2megads_weight_converter.py \ +covert_hf2mds_args="deepspeed tools/hf2megads_weight_converter.py \ --hf-ckpt-num-shards 2 \ ---origin-hf-ckpt-dir $HF_LLAMA_PATH \ +--hf-ckpt-dir $HF_LLAMA_PATH \ +--load-mode auto \ --save $MEGA_DS_LLAMA_PATH" +covert_mds2hf_args="deepspeed tools/hf2megads_weight_converter.py \ +--hf-ckpt-num-shards 2 \ +--hf-ckpt-dir $HF_LLAMA_PATH \ +--load-mode auto \ +--to-hf-ckpt \ +--load $MEGA_DS_LLAMA_PATH \ +--save $HF_LLAMA_PATH'-hf-out' " + finetune_args="deepspeed finetune_llama.py \ --load 
$MEGA_DS_LLAMA_PATH" @@ -98,8 +107,10 @@ comm_args="--tensor-model-parallel-size $TP \ --no-gradient-accumulation-fusion \ --repeated-dataloader" -if [ "$1" = "convert" ]; then - task_args="$covert_args" +if [ "$1" = "convert_hf2mds" ]; then + task_args="$covert_hf2mds_args" +elif [ "$1" = "convert_mds2hf" ]; then + task_args="$covert_mds2hf_args" else task_args="$finetune_args" fi diff --git a/examples_deepspeed/pretrain_llama2_distributed.sh b/examples_deepspeed/pretrain_llama2_distributed.sh index f275ea636a..4c790e8c19 100755 --- a/examples_deepspeed/pretrain_llama2_distributed.sh +++ b/examples_deepspeed/pretrain_llama2_distributed.sh @@ -41,6 +41,17 @@ GRAD_CLIP=1 # activation_checkpoint="true" activation_checkpoint="false" +LOG_TO_WANDB=0 +WANDB_ARGS= +if [ $LOG_TO_WANDB -eq 1 ] +then +WANDB_ARGS="\ + --wandb-project pretrain-llama2 \ + --wandb-exp-name exp0 \ + --wandb-save-dir ${BASE_PATH}/wandb \ + " +fi + # Below configuration required for llama model as per llama paper # --no-query-key-layer-scaling \ # --attention-dropout 0 \ @@ -53,7 +64,6 @@ activation_checkpoint="false" ###################################### - cat < $DS_CONFIG { "train_batch_size" : $GLOBAL_BATCH_SIZE, @@ -132,4 +142,5 @@ torchrun $DISTRIBUTED_ARGS \ --normalization rmsnorm \ --disable-bias-linear \ --num-key-value-heads $NUM_KV_HEADS \ + $WANDB_ARGS \ $ds_args diff --git a/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh b/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh index da028dc731..24bfa544d6 100644 --- a/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh +++ b/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh @@ -187,14 +187,6 @@ host="${HOSTNAME}" seed=1234 num_workers=0 -data_path="BookCorpusDataset_text_document" -if [ ! 
-f "BookCorpusDataset_text_document.bin" ]; then - wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin -fi -if [ ! -f "BookCorpusDataset_text_document.idx" ]; then - wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx -fi - vocab_path="gpt2-vocab.json" if [ ! -f "$vocab_path" ]; then wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json @@ -204,6 +196,24 @@ if [ ! -f "$merge_path" ]; then wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt fi + +data_path="BookCorpusDataset_text_document" +if [ ! -f "BookCorpusDataset_text_document.bin" ]; then + # Download the Bookcorpus dataset and convert to json + python preprocess_bookcorpus.py + + # Process the dataset + python ${dir}/../../tools/preprocess_data.py \ + --input ${data_path}.json \ + --output-prefix "BookCorpusDataset" \ + --vocab-file $vocab_path \ + --merge-file $merge_path \ + --dataset-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --workers 32 \ + --append-eod +fi + prescale_grad="true" jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B" jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}" diff --git a/examples_deepspeed/sequence_parallel/preprocess_bookcorpus.py b/examples_deepspeed/sequence_parallel/preprocess_bookcorpus.py new file mode 100644 index 0000000000..c35a13ea68 --- /dev/null +++ b/examples_deepspeed/sequence_parallel/preprocess_bookcorpus.py @@ -0,0 +1,4 @@ +from datasets import load_dataset + +train_data = load_dataset('bookcorpus/bookcorpus', split='train') +train_data.to_json("BookCorpusDataset_text_document.json", lines=True) diff --git a/examples_deepspeed/universal_checkpointing/README.md b/examples_deepspeed/universal_checkpointing/README.md index 341b0d113f..281d320e99 100644 --- a/examples_deepspeed/universal_checkpointing/README.md +++ b/examples_deepspeed/universal_checkpointing/README.md @@ 
-10,12 +10,12 @@ This folder contains example scripts that demonstrate how to use Universal Check For ZeRO stage 1, we provide bash scripts for bf16 and fp16 training examples corresponding to the steps 1 and 3 above. The step 1 scripts launch a training run of TP=PP=DP=2 of 200 iterations that creates a checkpoint every 100 iterations. The step 3 scripts load a universal checkpoint of iteration 100 and resume training with TP=PP=2 and DP=1 for an additional 100 iterations. Users can modify these scripts to try out other save and resume 3D combinations (e.g., save TP=PP=DP=1 and resume TP=PP=DP=2). Tensorboard logs are created by both step 1 and 3 scripts to enable visual inspection of how well the loss curves of the initial and resumed training runs match, especially at iteration 101. 1. bf16: - * run_bf16.sh: step 1 - * run_universal_bf16.sh: step 3 + * megatron_gpt/run_bf16.sh: step 1 + * megatron_gpt/run_universal_bf16.sh: step 3 2. fp16: - * run_fp16.sh: step 1 - * run_universal_fp16.sh: step 3 + * megatron_gpt/run_fp16.sh: step 1 + * megatron_gpt/run_universal_fp16.sh: step 3 Please note that these scripts should be run from the root folder of the repo (i.e., two levels above this README). For illustration, here are the commands for running the bf16 example. 
@@ -41,22 +41,22 @@ NOTE: Make sure to update your `BASE_DATA_PATH` path in the `run_[bf16/fp16].sh` ### Step 1: Create ZeRO checkpoint ```bash - bash examples_deepspeed/universal_checkpointing/run_bf16.sh + bash examples_deepspeed/universal_checkpointing/megatron_gpt/run_bf16.sh ``` -By default the script will create the checkpoints in folder `z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy` +By default the script will create the checkpoints in folder `z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy` ### Step 2: Convert ZeRO checkpoint of iteration 100 to Universal format Assuming the DeepSpeed source code is cloned into the home folder, the following command will generate universal checkpoint for iteration 100. ```bash python ${HOME}/DeepSpeed/deepspeed/checkpoint/ds_to_universal.py \ - --input_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/global_step100 \ - --output_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/global_step100_universal + --input_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy/global_step100 \ + --output_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy/global_step100_universal ``` Note that we chose to create the universal checkpoint in the same checkpoint folder as the ZeRO checkpoint. This maintains the normal checkpoint folder structure expected by the Megatron-DeepSpeed code, which makes it easy to load universal checkpoints with little/no script or code changes. For clarity, we show below the contents of the checkpoint folder after creation of the universal checkpoint. Note that the conversion script creates `global_step100_universal` folder and `latest_universal` file. 
```bash -ls -l z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/ +ls -l z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy/ total 48 drwxr-xr-x 2 user group 4096 Oct 21 08:51 global_step100 drwxr-xr-x 3 user group 4096 Oct 21 09:28 global_step100_universal @@ -69,7 +69,7 @@ drwxr-xr-x 2 user group 4096 Oct 21 09:01 global_step200 ### Step 3: Resume training with Universal checkpoint of iteration 100 ```bash -bash examples_deepspeed/universal_checkpointing/run_universal_bf16.sh +bash examples_deepspeed/universal_checkpointing/megatron_gpt/run_universal_bf16.sh ``` This resumption script effects the loading of universal checkpoint rather than the ZeRO checkpoint in the folder by passing `--universal-checkpoint` command line flag to the main training script (i.e., `pretrain_gpt.py`). @@ -77,13 +77,15 @@ Please see the corresponding [pull request](https://github.com/microsoft/Megatro Combining sequence parallelism with data parallelism is another good use case for universal checkpointing, see [sp pull request](https://github.com/microsoft/DeepSpeed/pull/4752) for example and visualization of matching loss values. +Note: Converting between model weights trained with the ```--no-pipeline-parallel``` parameter and model weights trained without the ```--no-pipeline-parallel``` parameter is currently not supported. + ### TensorBoard Log Analysis The Universal Checkpointing example includes a TensorBoard analysis script that will generate `csv` files and `png` plots across the unviersal checkpointing training steps for comparison of training and validation loss curves. 
After Step 3 is completed, the script may be executed as follows: ```bash -bash examples_deepspeed/universal_checkpointing/run_tb_analysis.sh z1_uni_ckpt +bash examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt.sh z1_uni_ckpt ``` The script will output the following `csv` files: @@ -116,4 +118,25 @@ Repeat steps in ZeRO stage 1 training above with the following modifications to * Set ZERO_STAGE=2 * Add `--no-pipeline-parallel` flag to deepspeed options -## ZeRO stage 3 training (**Coming soon**) +## ZeRO stage 3 training +Repeat steps in ZeRO stage 1 training above with the following modifications to your job batch scripts: +* Set ZERO_STAGE=3 +* Add `--no-pipeline-parallel` flag to deepspeed options + +> **Note:** The stage 3 universal checkpoint currently supports data parallelism. + +Below is the visualization of the `png` files generated from ZeRO stage 3. + +
+ + + *Figure 1: Training LM loss curve for the first 200 training steps of Step 1 (TP=1, PP=1, DP=4) and training steps 101 to 200 of Step 3 (TP=1, PP=1, DP=2), which was loaded using the Universal Checkpoint.* +
+ +
+ + + *Figure 2: Validation LM loss curve for the first 200 training steps of Step 1 (TP=1, PP=1, DP=4) and training steps 101 to 200 of Step 3 (TP=1, PP=1, DP=2), which was loaded using the Universal Checkpoint.* +
+ + diff --git a/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_training_loss.png b/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_training_loss.png new file mode 100644 index 0000000000..4c6758e991 Binary files /dev/null and b/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_training_loss.png differ diff --git a/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_validation_loss.png b/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_validation_loss.png new file mode 100644 index 0000000000..30d6f72eb8 Binary files /dev/null and b/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_validation_loss.png differ diff --git a/examples_deepspeed/universal_checkpointing/llama/run_llama_bf16.sh b/examples_deepspeed/universal_checkpointing/llama/run_llama_bf16.sh new file mode 100644 index 0000000000..72e79d4f1f --- /dev/null +++ b/examples_deepspeed/universal_checkpointing/llama/run_llama_bf16.sh @@ -0,0 +1,175 @@ +#!/bin/bash +set -ex + +DIR=`pwd` +###################################### +# Change the below configurations here +BASE_PATH=dataset +DS_CONFIG=${BASE_PATH}/deepspeed.json +DATASET=${BASE_PATH}/my-gpt2_text_document +TOKENIZER_PATH=${BASE_PATH}/llama-7b/tokenizer.model # offical llama tokenizer.model + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 + +HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +NUM_LAYERS=24 # e.g. llama-13b: 40 +NUM_HEADS=16 # e.g. 
llama-13b: 40 +SEQ=2048 + +LR_WARMUP_STEPS=2000 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" + +ZERO_STAGE=1 +DTYPE="bf16" + +# 3D parallelism of training +TP=2 +PP=2 +DP=2 +SP=1 +WORLD_SIZE=$((TP*PP*DP*SP)) +GLOBAL_BATCH=32 +MICRO_BATCH=$((GLOBAL_BATCH/WORLD_SIZE)) +TRAIN_ITERS=250000 +LR=3e-4 +MIN_LR=3e-5 + +# Debug +DEBUG_MODE=1 +if [[ $DEBUG_MODE == 1 ]]; then + EXIT_INTERVAL=200 + SIZE_TAG="toy" +else + EXIT_INTERVAL=$TRAIN_ITERS + SIZE_TAG="big" +fi + +# 3D parallelism of checkpoint to load +LOAD_TP=$TP +LOAD_PP=$PP +LOAD_DP=$DP +LOAD_SP=$SP +RUN_TAG="save" + + +EXP_DIR="z${ZERO_STAGE}_uni_ckpt" +CHECKPOINT_PATH=${EXP_DIR}/checkpoints/llama/z${ZERO_STAGE}/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_${SIZE_TAG} +LOAD_CHECKPOINT_PATH=${EXP_DIR}/checkpoints/llama/z${ZERO_STAGE}/$DTYPE/tp${LOAD_TP}_pp${LOAD_PP}_dp${LOAD_DP}_sp${LOAD_SP}_${SIZE_TAG} +LOG_DIR="${EXP_DIR}/tensorboard/llama/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_${SIZE_TAG}_${RUN_TAG}" +mkdir -p $LOG_DIR + +# Below configuration required for llama model as per llama paper +# --no-query-key-layer-scaling \ +# --attention-dropout 0 \ +# --hidden-dropout 0 \ +# --use-rotary-position-embeddings \ +# --untie-embeddings-and-output-weights \ +# --swiglu \ +# --normalization rmsnorm \ +# --disable-bias-linear \ +###################################### + +cat < $DS_CONFIG +{ + "train_batch_size" : $GLOBAL_BATCH, + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + + "zero_optimization": { + "stage": $ZERO_STAGE + }, + + "bf16": { + "enabled": true + }, + + "wall_clock_breakdown" : false +} +EOT + +ds_args="" +ds_args=" --deepspeed ${ds_args}" +ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" +ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + +if [ 
"${activation_checkpoint}" = "true" ]; then + ds_args="--deepspeed-activation-checkpointing ${ds_args}" + + ## old argument for recomputing the transformer layer + # ds_args="--checkpoint-activations ${ds_args}" + + ## new argument for recomputing the transformer layer + ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}" + ## new argument for recomputing only the attention layer + # ds_args="--recompute-granularity selective ${ds_args}" +fi + +if [[ ${ZERO_STAGE} -gt 1 ]]; then +ds_args="${ds_args} \ + --no-pipeline-parallel" +fi + +options="\ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --ds-sequence-parallel-size $SP \ + --num-layers $NUM_LAYERS \ + --hidden-size $HIDDEN_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --num-attention-heads $NUM_HEADS \ + --micro-batch-size $MICRO_BATCH \ + --global-batch-size $GLOBAL_BATCH \ + --seq-length $SEQ \ + --max-position-embeddings $SEQ \ + --train-iters $TRAIN_ITERS \ + --save ${CHECKPOINT_PATH} \ + --load ${LOAD_CHECKPOINT_PATH} \ + --data-path $DATASET \ + --data-impl mmap \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model $TOKENIZER_PATH \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr $LR \ + --lr-decay-style cosine \ + --min-lr $MIN_LR \ + --weight-decay $WEIGHT_DECAY \ + --clip-grad $GRAD_CLIP \ + --lr-warmup-iters $LR_WARMUP_STEPS \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --save-interval 100 \ + --eval-interval 10 \ + --eval-iters 40 \ + --exit-interval ${EXIT_INTERVAL} \ + --${DTYPE} \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --untie-embeddings-and-output-weights \ + --swiglu \ + --normalization rmsnorm \ + --disable-bias-linear \ + --tensorboard-dir $LOG_DIR \ + $ds_args +" + +WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" +run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py 
$@ ${options}" + +echo ${options} +echo ${run_cmd} +eval ${run_cmd} diff --git a/examples_deepspeed/universal_checkpointing/llama/run_tb_analysis_llama.sh b/examples_deepspeed/universal_checkpointing/llama/run_tb_analysis_llama.sh new file mode 100755 index 0000000000..b807fb97a7 --- /dev/null +++ b/examples_deepspeed/universal_checkpointing/llama/run_tb_analysis_llama.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +OUTPUT_PATH=$1 + +if [ "$OUTPUT_PATH" == "" ]; then + OUTPUT_PATH="z1_uni_ckpt" +fi + +# Training Loss +python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ + --tb_dir $OUTPUT_PATH \ + --tb_event_key "lm-loss-training/lm loss" \ + --plot_name "uc_char_training_loss.png" \ + --plot_title "Llama 7B Universal Checkpointing - Training Loss" \ + +# Validation Loss +python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ + --tb_dir $OUTPUT_PATH \ + --tb_event_key "lm-loss-validation/lm loss validation" \ + --csv_name "val_" \ + --plot_name "uc_char_validation_loss.png" \ + --plot_title "Llama 7B Universal Checkpointing - Validation Loss" \ + --plot_y_label "Validation LM Loss" \ diff --git a/examples_deepspeed/universal_checkpointing/llama/run_universal_llama_bf16.sh b/examples_deepspeed/universal_checkpointing/llama/run_universal_llama_bf16.sh new file mode 100644 index 0000000000..334fa3eaf6 --- /dev/null +++ b/examples_deepspeed/universal_checkpointing/llama/run_universal_llama_bf16.sh @@ -0,0 +1,176 @@ +#!/bin/bash +set -ex + +DIR=`pwd` +###################################### +# Change the below configurations here +BASE_PATH=dataset +DS_CONFIG=${BASE_PATH}/deepspeed.json +DATASET=${BASE_PATH}/my-gpt2_text_document +TOKENIZER_PATH=${BASE_PATH}/llama-7b/tokenizer.model # offical llama tokenizer.model + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 + +HIDDEN_SIZE=2048 # 
e.g. llama-13b: 5120 +FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +NUM_LAYERS=24 # e.g. llama-13b: 40 +NUM_HEADS=16 # e.g. llama-13b: 40 +SEQ=2048 + +LR_WARMUP_STEPS=2000 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" + +ZERO_STAGE=1 +DTYPE="bf16" + +# 3D parallelism of training +TP=2 +PP=2 +DP=1 +SP=1 +WORLD_SIZE=$((TP*PP*DP*SP)) +GLOBAL_BATCH=32 +MICRO_BATCH=$((GLOBAL_BATCH/WORLD_SIZE)) +TRAIN_ITERS=250000 +LR=3e-4 +MIN_LR=3e-5 + +# Debug +DEBUG_MODE=1 +if [[ $DEBUG_MODE == 1 ]]; then + EXIT_INTERVAL=200 + SIZE_TAG="toy" +else + EXIT_INTERVAL=$TRAIN_ITERS + SIZE_TAG="big" +fi + +# 3D parallelism of checkpoint to load +LOAD_TP=2 +LOAD_PP=2 +LOAD_DP=2 +LOAD_SP=1 +RUN_TAG="uni_load${LOAD_TP}_${LOAD_PP}_${LOAD_DP}_${LOAD_SP}" + + +EXP_DIR="z${ZERO_STAGE}_uni_ckpt" +CHECKPOINT_PATH=${EXP_DIR}/checkpoints/llama/z${ZERO_STAGE}/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_${SIZE_TAG} +LOAD_CHECKPOINT_PATH=${EXP_DIR}/checkpoints/llama/z${ZERO_STAGE}/$DTYPE/tp${LOAD_TP}_pp${LOAD_PP}_dp${LOAD_DP}_sp${LOAD_SP}_${SIZE_TAG} +LOG_DIR="${EXP_DIR}/tensorboard/llama/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_${SIZE_TAG}_${RUN_TAG}" +mkdir -p $LOG_DIR + +# Below configuration required for llama model as per llama paper +# --no-query-key-layer-scaling \ +# --attention-dropout 0 \ +# --hidden-dropout 0 \ +# --use-rotary-position-embeddings \ +# --untie-embeddings-and-output-weights \ +# --swiglu \ +# --normalization rmsnorm \ +# --disable-bias-linear \ +###################################### + +cat < $DS_CONFIG +{ + "train_batch_size" : $GLOBAL_BATCH, + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + + "zero_optimization": { + "stage": $ZERO_STAGE + }, + + "bf16": { + "enabled": true + }, + + "wall_clock_breakdown" : false +} +EOT + 
+ds_args="" +ds_args=" --deepspeed ${ds_args}" +ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" +ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + +if [ "${activation_checkpoint}" = "true" ]; then + ds_args="--deepspeed-activation-checkpointing ${ds_args}" + + ## old argument for recomputing the transformer layer + # ds_args="--checkpoint-activations ${ds_args}" + + ## new argument for recomputing the transformer layer + ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}" + ## new argument for recomputing only the attention layer + # ds_args="--recompute-granularity selective ${ds_args}" +fi + +if [[ ${ZERO_STAGE} -gt 1 ]]; then +ds_args="${ds_args} \ + --no-pipeline-parallel" +fi + +options="\ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --ds-sequence-parallel-size $SP \ + --num-layers $NUM_LAYERS \ + --hidden-size $HIDDEN_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --num-attention-heads $NUM_HEADS \ + --micro-batch-size $MICRO_BATCH \ + --global-batch-size $GLOBAL_BATCH \ + --seq-length $SEQ \ + --max-position-embeddings $SEQ \ + --train-iters $TRAIN_ITERS \ + --save ${CHECKPOINT_PATH} \ + --load ${LOAD_CHECKPOINT_PATH} \ + --data-path $DATASET \ + --data-impl mmap \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model $TOKENIZER_PATH \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr $LR \ + --lr-decay-style cosine \ + --min-lr $MIN_LR \ + --weight-decay $WEIGHT_DECAY \ + --clip-grad $GRAD_CLIP \ + --lr-warmup-iters $LR_WARMUP_STEPS \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --save-interval 100 \ + --eval-interval 10 \ + --eval-iters 40 \ + --exit-interval ${EXIT_INTERVAL} \ + --${DTYPE} \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --untie-embeddings-and-output-weights \ + --swiglu \ + --normalization rmsnorm \ + --disable-bias-linear \ + 
--tensorboard-dir $LOG_DIR \ + --universal-checkpoint \ + $ds_args +" + +WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" +run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" + +echo ${options} +echo ${run_cmd} +eval ${run_cmd} diff --git a/examples_deepspeed/universal_checkpointing/run_bf16.sh b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_bf16.sh similarity index 99% rename from examples_deepspeed/universal_checkpointing/run_bf16.sh rename to examples_deepspeed/universal_checkpointing/megatron_gpt/run_bf16.sh index 0953954222..07cbc30e72 100755 --- a/examples_deepspeed/universal_checkpointing/run_bf16.sh +++ b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_bf16.sh @@ -3,7 +3,7 @@ DIR=`pwd` DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` -BASE_DATA_PATH=datasets +BASE_DATA_PATH=dataset DATASET=${BASE_DATA_PATH}/my-gpt2_text_document VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt diff --git a/examples_deepspeed/universal_checkpointing/run_fp16.sh b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_fp16.sh similarity index 99% rename from examples_deepspeed/universal_checkpointing/run_fp16.sh rename to examples_deepspeed/universal_checkpointing/megatron_gpt/run_fp16.sh index 691fa8a8e6..2f1b994079 100755 --- a/examples_deepspeed/universal_checkpointing/run_fp16.sh +++ b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_fp16.sh @@ -3,7 +3,7 @@ DIR=`pwd` DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` -BASE_DATA_PATH=datasets +BASE_DATA_PATH=dataset DATASET=${BASE_DATA_PATH}/my-gpt2_text_document VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt diff --git a/examples_deepspeed/universal_checkpointing/run_tb_analysis.sh b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt.sh similarity index 96% rename from examples_deepspeed/universal_checkpointing/run_tb_analysis.sh rename to 
examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt.sh index 7aa988a0a0..3a17d66750 100755 --- a/examples_deepspeed/universal_checkpointing/run_tb_analysis.sh +++ b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt.sh @@ -16,7 +16,6 @@ python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_scrip --tb_event_key "lm-loss-training/lm loss" \ --plot_name "uc_char_training_loss.png" \ --plot_title "Megatron-GPT Universal Checkpointing - Training Loss" \ - --use_sns # Validation Loss python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ @@ -26,4 +25,3 @@ python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_scrip --plot_name "uc_char_validation_loss.png" \ --plot_title "Megatron-GPT Universal Checkpointing - Validation Loss" \ --plot_y_label "Validation LM Loss" \ - --use_sns diff --git a/examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt_plot_only.sh b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt_plot_only.sh new file mode 100755 index 0000000000..0c3ea5399c --- /dev/null +++ b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt_plot_only.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +OUTPUT_PATH=$1 + +if [ "$OUTPUT_PATH" == "" ]; then + OUTPUT_PATH="z1_uni_ckpt" +fi + +# Training Loss +python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ + --tb_dir $OUTPUT_PATH \ + --tb_event_key "lm-loss-training/lm loss" \ + --plot_name "uc_char_training_loss.png" \ + --plot_title "Megatron-GPT Universal Checkpointing - Training Loss" \ + --plot_only \ + --csv_dir "/workspace/uc/megatron/loss_csv" \ + +# Validation Loss +python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ + --tb_dir $OUTPUT_PATH \ + --tb_event_key "lm-loss-validation/lm loss validation" \ + --csv_name "val_" \ + --plot_name "uc_char_validation_loss.png" \ + --plot_title "Megatron-GPT Universal Checkpointing - Validation Loss" \ + --plot_y_label "Validation LM Loss" \ + --plot_only \ + --csv_dir "/workspace/uc/megatron/val_csv" \ diff --git a/examples_deepspeed/universal_checkpointing/run_universal_bf16.sh b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_universal_bf16.sh similarity index 99% rename from examples_deepspeed/universal_checkpointing/run_universal_bf16.sh rename to examples_deepspeed/universal_checkpointing/megatron_gpt/run_universal_bf16.sh index ef0e134cfc..4134b9df48 100755 --- a/examples_deepspeed/universal_checkpointing/run_universal_bf16.sh +++ b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_universal_bf16.sh @@ -3,7 +3,7 @@ DIR=`pwd` DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` -BASE_DATA_PATH=datasets +BASE_DATA_PATH=dataset DATASET=${BASE_DATA_PATH}/my-gpt2_text_document VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt diff --git a/examples_deepspeed/universal_checkpointing/run_universal_fp16.sh b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_universal_fp16.sh similarity index 99% rename from examples_deepspeed/universal_checkpointing/run_universal_fp16.sh 
rename to examples_deepspeed/universal_checkpointing/megatron_gpt/run_universal_fp16.sh index 1e207e422b..bb3a538951 100755 --- a/examples_deepspeed/universal_checkpointing/run_universal_fp16.sh +++ b/examples_deepspeed/universal_checkpointing/megatron_gpt/run_universal_fp16.sh @@ -3,7 +3,7 @@ DIR=`pwd` DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` -BASE_DATA_PATH=datasets +BASE_DATA_PATH=dataset DATASET=${BASE_DATA_PATH}/my-gpt2_text_document VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt diff --git a/examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py b/examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py index 3dacb45d4e..ca80872ca0 100644 --- a/examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py +++ b/examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py @@ -17,3 +17,5 @@ parser.add_argument("--skip_csv", action='store_true', help="Skip generation of csv files") parser.add_argument("--use_sns", action='store_true', help="Use the SNS library to format plot") parser.add_argument("--csv_name", required=False, default="", type=str, help="Unique name for CSV files") +parser.add_argument("--plot_only", action='store_true', help="Plot only using csv files") +parser.add_argument("--csv_dir", required=False, type=str, help="Directory for csv files") diff --git a/examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py b/examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py index 337f6540ab..fbf9b6dd28 100644 --- a/examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py +++ b/examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py @@ -6,9 +6,10 @@ import os import re import pandas as pd +import csv import matplotlib.pyplot as plt from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -from utils import get_analyzer, find_files +from utils import 
get_analyzer, find_files_prefix, find_files_suffix from arguments import parser args = parser.parse_args() @@ -18,8 +19,8 @@ sns.set() def main(): - target_affix = 'events.out.tfevents' - tb_log_paths = find_files(args.tb_dir, target_affix) + target_prefix = 'events.out.tfevents' + tb_log_paths = find_files_prefix(args.tb_dir, target_prefix) analyzer = get_analyzer(args.analyzer) @@ -41,6 +42,8 @@ def main(): df = pd.DataFrame({"step": x, "value": y}) df.to_csv(f"{args.csv_name}{analyzer.get_csv_filename()}.csv") + plt.grid(True) + if not args.skip_plot: plt.legend() plt.title(args.plot_title) @@ -48,5 +51,35 @@ def main(): plt.ylabel(args.plot_y_label) plt.savefig(args.plot_name) +def plot_csv(): + target_suffix = 'csv' + csv_log_files = find_files_suffix(args.csv_dir, target_suffix) + + analyzer = get_analyzer(args.analyzer) + + for csv_file in csv_log_files: + analyzer.set_names(csv_file) + + x, y = [], [] + with open(csv_file, 'r') as file: + reader = csv.reader(file) + for row in reader: + if row[1] == 'step': + continue + x.append(int(row[1])) # Assuming the first column contains x values + y.append(float(row[2])) # Assuming the second column contains y values + + plt.plot(x, y, label=f'{analyzer.get_label_name()}') + + plt.grid(True) + plt.legend() + plt.title(args.plot_title) + plt.xlabel(args.plot_x_label) + plt.ylabel(args.plot_y_label) + plt.savefig(args.plot_name) + if __name__ == "__main__": - main() + if args.plot_only: + plot_csv() + else: + main() diff --git a/examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py b/examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py index f5809c3dc1..20d46ff6a8 100644 --- a/examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py +++ b/examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py @@ -19,7 +19,7 @@ def set_names(self, path_name): tp, pp, dp, sp = match.groups() self._label_name = f"Training Run: TP: {tp}, PP: {pp}, DP: {dp}" - self._csv_name = 
f"uc_out_tp_{tp}_pp_{pp}_dp_{dp}_sp_{sp}" + self._csv_name = f"uc_out_tp{tp}_pp{pp}_dp{dp}_sp{sp}" def get_label_name(self): return self._label_name diff --git a/examples_deepspeed/universal_checkpointing/tb_analysis/utils.py b/examples_deepspeed/universal_checkpointing/tb_analysis/utils.py index 4bbbb3f2f0..db6624bbc4 100644 --- a/examples_deepspeed/universal_checkpointing/tb_analysis/utils.py +++ b/examples_deepspeed/universal_checkpointing/tb_analysis/utils.py @@ -7,13 +7,13 @@ from uc_analysis import UniversalCheckpointingAnalysis -def find_files(directory, file_affix): +def find_files_prefix(directory, file_prefix): """ - Searches for files with a specific affix in a directory using os.walk(). + Searches for files with a specific prefix in a directory using os.walk(). Args: directory (str): The path to the directory to search. - file_affix (str): The desired file affix. + file_prefix (str): The desired file prefix. Returns: list: A list of paths to matching files. @@ -21,10 +21,28 @@ def find_files(directory, file_affix): matching_paths = [] for root, _, files in os.walk(directory): for filename in files: - if root not in matching_paths and filename.lower().startswith(file_affix.lower()): + if root not in matching_paths and filename.lower().startswith(file_prefix.lower()): matching_paths.append(os.path.join(root)) return matching_paths +def find_files_suffix(directory, file_suffix): + """ + Searches for files with a specific suffix in a directory using os.walk(). + + Args: + directory (str): The path to the directory to search. + file_suffix (str): The desired file suffix. + + Returns: + list: A list of paths to matching files. 
+ """ + matching_paths = [] + for root, _, files in os.walk(directory): + for filename in files: + if filename.lower().endswith(file_suffix.lower()): + matching_paths.append(os.path.join(root, filename)) # keep the directory so the path can be opened from any working directory + return matching_paths + def get_analyzer(analyzer_name): if analyzer_name == 'universal_checkpointing': return UniversalCheckpointingAnalysis() diff --git a/examples_deepspeed/zero_bubble_pp/README.md b/examples_deepspeed/zero_bubble_pp/README.md new file mode 100644 index 0000000000..2290860783 --- /dev/null +++ b/examples_deepspeed/zero_bubble_pp/README.md @@ -0,0 +1,51 @@ +# Zero Bubble Pipeline Parallelism Tutorials + +This folder contains examples and tutorials to enable Zero Bubble Pipeline Parallelism ([Paper Link](https://arxiv.org/abs/2401.10241)). The key idea is to break a backward pass into a $B$ pass and $W$ pass. $B$ on one stage will only depend on the $B$ on its next stage, compared to depending on both $B$ and $W$ in 1F1B. + +![BW Split](./bw_split.png) + +Currently supported zero bubble schedules: +* ZB-H1 + +## ZB-H1 + +![alt text](zbh1.png) + +As shown in the above image, the ZB-H1 schedule cuts pipeline bubble of 1F1B to 1/3. + +### ZB-H1 and Its Variation +There're two versions of ZB-H1 implemented in Megatron-Deepspeed: an official version (the 2nd schedule in the above image) which does a uniform B-W split, and another variation (the 3rd schedule in image) that does B-W split only when necessary. We provide the variation version as the default implementation. + +In practice the variation version is more friendly to a synchronized communication implementation and combined usage with tensor parallelism. However it changes the ordering of applying weight update of different microbatches (E.g. for Device 4 in the image above, the ordering of applying weight update is 4->5->6->7->1->2->3->8), hence might result in slightly different loss curve. 
+ + +### How to use + +Simply add the following flag to the options to enable ZB-H1: + +``` +--enable-zbh1-pipeline +``` +The default implementation is the variation version of ZB-H1 mentioned in [Previous Section](#zb-h1). + +If you want the bit-for-bit exact semantics when compared to 1F1B, you can use the following flag. It might be a bit slower than the default implementation. + +``` +--enable-zbh1-exact-semantics +``` + +### ZB-H1 Toy Example + +Here is a toy example for using **ZB-H1** inside the DeepSpeed repo. + +First, you'll need to prepare some sample training data and change the `data_path` in `zbh1_pretrain_gpt_1.3b.sh`. Then, from this folder, run + +``` +bash zbh1_pretrain_gpt_1.3b.sh +``` + +## Benchmarks + +The implementation has been checked and verified on various setups such as ZeRO Stage 1, activation recomputation, flash attention, tensor parallel, data parallel and bf16. By approximate measure, ~10% acceleration was observed when microbatch count is twice the number of pipeline stages: + +![alt text](benchmark.png) \ No newline at end of file diff --git a/examples_deepspeed/zero_bubble_pp/benchmark.png b/examples_deepspeed/zero_bubble_pp/benchmark.png new file mode 100644 index 0000000000..be46817d75 Binary files /dev/null and b/examples_deepspeed/zero_bubble_pp/benchmark.png differ diff --git a/examples_deepspeed/zero_bubble_pp/bw_split.png b/examples_deepspeed/zero_bubble_pp/bw_split.png new file mode 100644 index 0000000000..1ced957b44 Binary files /dev/null and b/examples_deepspeed/zero_bubble_pp/bw_split.png differ diff --git a/examples_deepspeed/zero_bubble_pp/zbh1.png b/examples_deepspeed/zero_bubble_pp/zbh1.png new file mode 100644 index 0000000000..364ef368a3 Binary files /dev/null and b/examples_deepspeed/zero_bubble_pp/zbh1.png differ diff --git a/examples_deepspeed/zero_bubble_pp/zbh1_pretrain_gpt_1.3b.sh b/examples_deepspeed/zero_bubble_pp/zbh1_pretrain_gpt_1.3b.sh new file mode 100644 index 0000000000..cf5705d973 --- /dev/null +++ 
b/examples_deepspeed/zero_bubble_pp/zbh1_pretrain_gpt_1.3b.sh @@ -0,0 +1,367 @@ +#!/bin/bash +dir=`pwd` +############################################################################### +### Main configs +## GPT-3 models use 2K sequence length/context window +seq_len=2048 + + +## The "GPT-3 XXX" below are configs from GPT-3 paper +## https://arxiv.org/abs/2005.14165, choose based on +## your desired model size or build your own configs + + +## init_std is standard deviation for weight initialization. Usually larger +## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size) +## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf) + + +## We changed min_lr to a lower number (1.0e-6), which we found is able to +## provide better zero-shot eval results. + + +## GPT-3 Small 125M +# model_size=0.125 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# global_batch_size=256 +# lr=6.0e-4 +# min_lr=1.0e-6 +# init_std=0.02 + + +## GPT-3 Medium 350M +# model_size=0.35 +# num_layers=24 +# hidden_size=1024 +# num_attn_heads=16 +# global_batch_size=256 +# lr=3.0e-4 +# min_lr=1.0e-6 +# init_std=0.018 + + +## GPT-3 Large 760M +# model_size=0.76 +# num_layers=24 +# hidden_size=1536 +# num_attn_heads=16 +# global_batch_size=256 +# lr=2.5e-4 +# min_lr=1.0e-6 +# init_std=0.015 + + +## GPT-3 XL 1.3B +model_size=1.3 +num_layers=24 +hidden_size=2048 +num_attn_heads=16 +global_batch_size=16 +lr=2.0e-4 +min_lr=1.0e-6 +init_std=0.013 + + +## GPT-3 2.7B +# model_size=2.7 +# num_layers=32 +# hidden_size=2560 +# num_attn_heads=32 +# global_batch_size=512 +# lr=1.6e-4 +# min_lr=1.0e-6 +# init_std=0.011 + + +## GPT-3 6.7B +# model_size=6.7 +# num_layers=32 +# hidden_size=4096 +# num_attn_heads=32 +# global_batch_size=1024 +# lr=1.2e-4 +# min_lr=1.0e-6 +# init_std=0.009 + + +## GPT-3 13B +# model_size=13 +# num_layers=40 +# hidden_size=5120 +# num_attn_heads=40 +# global_batch_size=1024 +# lr=1.0e-4 +# min_lr=1.0e-6 +# init_std=0.008 + + +## GPT-3 175B +# 
model_size=175 +# num_layers=96 +# hidden_size=12288 +# num_attn_heads=96 +# global_batch_size=1536 +# lr=0.6e-4 +# min_lr=1.0e-6 +# init_std=0.005 +############################################################################### +### Training duration configs +## The main termination condition, original GPT-3 paper trains for 300B tokens. +train_tokens_in_billion=300 +train_tokens=$((${train_tokens_in_billion} * 1000000000)) + + +## train_samples is another termination condition and also affect the number of +## data samples to be indexed. Since we want to reach the train_tokens +## above, and data efficiency techniques may change num tokens in some samples, +## so we just set this config large enough to make sure we have enough +## processed data and don't terminate by train_samples. +train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} )) + + +## Another wall-clock time termination condition in minutes. Set it large +## enough to avoid undesired early termination. +exit_duration=30000000 +############################################################################### +### lr configs +## lr warmup and decay duration. +## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens. +## Here we increase the warmup tokens to 3B since when batch size warmup is not +## used, there are more tokens per step. Thus we need to increase warmup tokens +## to make sure there are enough warmup steps, which is important for training +## stability. 
+lr_warmup_tokens_in_million=3000 +lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000)) +## Here we changed the LR decay tokens to align with total train tokens, since +## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the +## learning rate schedule to match the number of training tokens results in the +## best final model quality +lr_decay_tokens_in_billion=${train_tokens_in_billion} +lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000)) +lr_decay_style="cosine" +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Note that currently both curriculum learning and random-LTD are NOT +## compatible with pipeline parallelism. +pp_size=8 +no_pp="false" + + +## ZeRO-based data parallelism, stage=0 will disable ZeRO +zero_stage=0 + + +## Total number of GPUs. ds_ssh is from DeepSpeed library. +num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2)) +num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +num_node=$(( ${num_gpus} / ${num_gpus_pernode} )) + + +## Data parallel size. +dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} )) + + +## Micro batch size per GPU +## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus +## Reduce it manually if GPU OOM +# batch_size=$(( ${global_batch_size} / ${dp_size} )) +batch_size=1 +############################################################################### +### Misc configs +log_interval=1 +eval_iters=10100 +eval_interval=10100 +# num_save controls how frequent to save checkpoint. num_save=20 means that a +# checkpoint will be saved every 5% of training. For longer training you would +# want larger num_save to save more frequently, and vice versa. 
+num_save=1 +# estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size})) +# save_interval=$((${estimated_train_iter} / ${num_save})) +save_interval=10100 + + +## Activation checkpointing saves GPU memory, but reduces training speed +activation_checkpoint="false" +# activation_checkpoint="false" + + +## Whether or not log optimizer states (norms, max abs values) to tensorboard. +## This is not required for training and might save GPU memory when turned off. +log_optimizer_state="true" +############################################################################### +### Output and data configs +current_time=$(date "+%Y.%m.%d_%H.%M.%S") +host="${HOSTNAME}" +seed=1234 +num_workers=0 + + +## Public the Pile dataset, can be downloaded at +## https://mystic.the-eye.eu/public/AI/pile_neox/ or +## https://the-eye.eu/public/AI/pile_neox/ Change data_home to where you +## store the pile_text_document.bin and pile_text_document.idx. +data_home="/code" +data_path="${data_home}/gpt_data/my-gpt2_text_document" + + +vocab_path="gpt2-vocab.json" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_path="gpt2-merges.txt" +if [ ! 
-f "$merge_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + + +prescale_grad="true" +jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B" +jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}" +jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}" +if [[ $zero_stage -gt 0 ]]; then + jobname="${jobname}_z${zero_stage}" + prescale_grad="false" +fi +if [[ $mp_size -gt 1 ]]; then + jobname="${jobname}_mp${mp_size}" +fi +if [ "${no_pp}" = "false" ]; then + jobname="${jobname}_pp${pp_size}" +fi +jobname="${jobname}_seed${seed}_rebase" + + +username=$(whoami) +output_home="/blob/users/${username}/project/data_efficient_gpt" +log_path="${output_home}/log/" +checkpoint_path="${output_home}/checkpoint/${jobname}" +## Microsoft internal constraint: because tensorboard is logged by last rank, +## it's better to put the path in NFS instead of Blob. +tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/" +tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}" +mkdir -p ${log_path} +mkdir -p ${checkpoint_path} +mkdir -p ${tensorboard_path} +############################################################################### +data_options=" \ + --vocab-file ${vocab_path} \ + --merge-file ${merge_path} \ + --data-path ${data_path} \ + --data-impl mmap" + + +## If CL is used, make sure to set "--split" the same as what you used during +## offline data analysis&indexing. 
+megatron_options=" \ + --override-opt_param-scheduler \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tensor-model-parallel-size ${mp_size} \ + --init-method-std ${init_std} \ + --lr-decay-tokens ${lr_decay_tokens} \ + --lr-warmup-tokens ${lr_warmup_tokens} \ + --micro-batch-size ${batch_size} \ + --exit-duration-in-mins ${exit_duration} \ + --global-batch-size ${global_batch_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --train-tokens ${train_tokens} \ + --train-samples ${train_samples} \ + --lr ${lr} \ + --min-lr ${min_lr} \ + --lr-decay-style ${lr_decay_style} \ + --split 949,50,1 \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --save-interval ${save_interval} \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --hysteresis 2 \ + --num-workers ${num_workers} \ + --fp16 \ + --seed ${seed} \ + --load ${checkpoint_path} \ + --save ${checkpoint_path} \ + --no-async-tensor-model-parallel-allreduce \ + --tensorboard-queue-size 1 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + --tensorboard-dir ${tensorboard_path}" + + +if [ "${activation_checkpoint}" = "true" ]; then +megatron_options="${megatron_options} \ + --checkpoint-activations" +fi + + +if [ "${log_optimizer_state}" = "true" ]; then +megatron_options="${megatron_options} \ + --log-optimizer-states-to-tensorboard" +fi + + +config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json" +template_json="../rebase/ds_config_gpt_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + > ${config_json} + + +deepspeed_options=" \ + 
--deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --enable-zbh1-pipeline \ + --enable-zbh1-exact-semantics \ + --pipeline-model-parallel-size ${pp_size}" + + +if [[ "${no_pp}" = "true" ]]; then +deepspeed_options="${deepspeed_options} \ + --no-pipeline-parallel" +fi + + +if [ "${activation_checkpoint}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --deepspeed-activation-checkpointing" +fi + + +## When saving checkpoint to a storage with cache, their could be consistency +## issue of the pointer to latest checkpoint. Here we find the correct pointer +## and broadcast it to all nodes. +iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt" +iteration_file_2="$checkpoint_path/latest" +iteration=0 +for (( node = 0; node <= num_node-1; node++ )) +do + if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then + local_iteration=$(ssh -q worker-"$node" cat $iteration_file) + iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} : ${iteration} )) + fi +done +if [[ $iteration -gt 0 ]]; then + iteration_2="global_step${iteration}" + ds_ssh "echo $iteration > $iteration_file" + ds_ssh "echo $iteration_2 > $iteration_file_2" +fi + + +deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee log_zbh1_exact.txt \ No newline at end of file diff --git a/generate_config.sh b/generate_config.sh deleted file mode 100644 index 6bea420a2a..0000000000 --- a/generate_config.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/bin/bash --login - -for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ - "$PP" "$DTYPE" -do - if [ -z $v ]; then - echo "Please export required envs before execute $0" - exit 1 - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 config_file" - exit 1 -fi - -extra="" -common="\ - \"train_batch_size\": $GLOBAL_BATCH, - \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, - \"steps_per_print\": 1, - 
\"gradient_accumulation_steps\": $GRAD_ACC_STEPS, - \"optimizer\": { - \"type\": \"AdamW\", - \"params\": { - \"lr\": ${LR}, - \"beta1\": 0.9, - \"beta2\": 0.95, - \"eps\": 1e-5, - \"weight_decay\": 1e-1 - } - }, - \"scheduler\": { - \"type\": \"WarmupLR\", - \"params\": { - \"warmup_min_lr\": 0.00003, - \"warmup_max_lr\": 0.0003, - \"warmup_num_steps\": 5000 - } - }, - \"zero_allow_untested_optimizer\": true, - \"gradient_clipping\": 1.0, - \"activation_checkpointing\": { - \"partition_activations\": true, - \"contiguous_memory_optimization\": false - }, - \"wall_clock_breakdown\": false," - -flops_profiler="\ - \"flops_profiler\": { - \"enabled\": false, - \"profile_step\": 45, - \"module_depth\": -1, - \"top_modules\": 1, - \"detailed\": true, - \"output_file\": null - }" - -if [[ $DTYPE == "bf16" ]]; then -dtype="\ - \"communication_data_type\": \"bfp16\", - \"fp16\": { - \"enabled\": false, - \"loss_scale\": 0, - \"loss_scale_window\": 1000, - \"hysteresis\": 2, - \"min_loss_scale\": 1 - }, - \"bfloat16\": { - \"enabled\": true, - \"loss_scale\": 1.0 - }," -else -dtype="\ - \"communication_data_type\": \"fp16\", - \"fp16\": { - \"enabled\": true, - \"loss_scale\": 0, - \"loss_scale_window\": 1000, - \"hysteresis\": 2, - \"min_loss_scale\": 1 - }, - \"bfloat16\": { - \"enabled\": false, - \"loss_scale\": 1.0 - }," -fi - -if [ $ZERO_STAGE == 3 ]; then -zero="\ - \"zero_optimization\": { - \"stage\": 3, - \"reduce_scatter\": false, - \"stage3_max_live_parameters\": 3e9, - \"stage3_max_reuse_distance\": 3e9, - \"stage3_param_persistence_threshold\": 1e5, - \"stage3_prefetch_bucket_size\": 5e7, - \"contiguous_gradients\": true, - \"overlap_comm\": true, - \"reduce_bucket_size\": 90000000, - \"sub_group_size\": 1e9, - \"offload_optimizer\": { - \"device\": \"none\", - \"buffer_count\": 4, - \"pipeline_read\": false, - \"pipeline_write\": false, - \"pin_memory\": true - } - }," -elif [ $ZERO_STAGE == 2 ] || [ $ZERO_STAGE == 1 ]; then -zero="\ - \"zero_optimization\": 
{ - \"stage\": $ZERO_STAGE - }," - if [ $ZERO_STAGE == 1 ]; then - if [ $PP > 1 ]; then - extra="\ - \"data_types\": { - \"grad_accum_dtype\": \"fp32\" - }, - \"comms_logger\": { - \"enabled\": true, - \"verbose\": false, - \"prof_all\": true, - \"debug\": false - }," - else - echo 'please add the config for zero_stage 1 without pipeline-parallelism' - fi - fi -else - echo 'Please add the correct config set!!!' -fi - -# flops_profiler must at the end because no ',' is allowed at the end -cat < $1 -{ -$common -$zero -$dtype -$extra -$flops_profiler -} -EOT diff --git a/generate_config_cpu_optimizer.sh b/generate_config_cpu_optimizer.sh deleted file mode 100644 index 99dec97958..0000000000 --- a/generate_config_cpu_optimizer.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash --login - -for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ - "$PP" "$DTYPE" -do - if [ -z $v ]; then - echo "Please export required envs before execute $0" - exit 1 - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 config_file" - exit 1 -fi - -extra="" -common="\ - \"train_batch_size\": $GLOBAL_BATCH, - \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, - \"steps_per_print\": 1, - \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, - \"optimizer\": { - \"type\": \"AdamW\", - \"params\": { - \"lr\": ${LR}, - \"beta1\": 0.9, - \"beta2\": 0.95, - \"eps\": 1e-5, - \"weight_decay\": 1e-1 - } - }, - \"scheduler\": { - \"type\": \"WarmupLR\", - \"params\": { - \"warmup_min_lr\": 0.00003, - \"warmup_max_lr\": 0.0003, - \"warmup_num_steps\": 5000 - } - }, - \"zero_allow_untested_optimizer\": true, - \"gradient_clipping\": 1.0, - \"activation_checkpointing\": { - \"partition_activations\": true, - \"contiguous_memory_optimization\": false - }, - \"wall_clock_breakdown\": false," - -flops_profiler="\ - \"flops_profiler\": { - \"enabled\": false, - \"profile_step\": 45, - \"module_depth\": -1, - \"top_modules\": 1, - \"detailed\": true, - \"output_file\": null - }" - -if [[ $DTYPE == 
"bf16" ]]; then -dtype="\ - \"communication_data_type\": \"bfp16\", - \"fp16\": { - \"enabled\": false, - \"loss_scale\": 0, - \"loss_scale_window\": 1000, - \"hysteresis\": 2, - \"min_loss_scale\": 1 - }, - \"bfloat16\": { - \"enabled\": true, - \"loss_scale\": 1.0 - }," -else -dtype="\ - \"communication_data_type\": \"fp16\", - \"fp16\": { - \"enabled\": true, - \"loss_scale\": 0, - \"loss_scale_window\": 1000, - \"hysteresis\": 2, - \"min_loss_scale\": 1 - }, - \"bfloat16\": { - \"enabled\": false, - \"loss_scale\": 1.0 - }," -fi - -if [ $ZERO_STAGE == 3 ]; then -zero="\ - \"zero_optimization\": { - \"stage\": 3, - \"reduce_scatter\": false, - \"stage3_max_live_parameters\": 3e9, - \"stage3_max_reuse_distance\": 3e9, - \"stage3_param_persistence_threshold\": 1e5, - \"stage3_prefetch_bucket_size\": 5e7, - \"contiguous_gradients\": true, - \"overlap_comm\": true, - \"reduce_bucket_size\": 90000000, - \"sub_group_size\": 1e9, - \"offload_optimizer\": { - \"device\": \"none\", - \"buffer_count\": 4, - \"pipeline_read\": false, - \"pipeline_write\": false, - \"pin_memory\": true - } - }," -elif [ $ZERO_STAGE == 2 ] || [ $ZERO_STAGE == 1 ]; then -zero="\ - \"zero_optimization\": { - \"stage\": $ZERO_STAGE, - \"offload_optimizer\": { - \"device\": \"cpu\", - \"buffer_count\": 4, - \"pipeline_read\": false, - \"pipeline_write\": false, - \"pin_memory\": true - } - }," - if [ $ZERO_STAGE == 1 ]; then - if [ $PP > 1 ]; then - extra="\ - \"data_types\": { - \"grad_accum_dtype\": \"fp32\" - }, - \"comms_logger\": { - \"enabled\": true, - \"verbose\": false, - \"prof_all\": true, - \"debug\": false - }," - else - echo 'please add the config for zero_stage 1 without pipeline-parallelism' - fi - fi -else - echo 'Please add the correct config set!!!' 
-fi - -# flops_profiler must at the end because no ',' is allowed at the end -cat < $1 -{ -$common -$zero -$dtype -$extra -$flops_profiler -} -EOT diff --git a/mds_to_hf.py b/mds_to_hf.py new file mode 100644 index 0000000000..d91513ed8b --- /dev/null +++ b/mds_to_hf.py @@ -0,0 +1,106 @@ +# Usage : python mds_to_hf.py --mds_checkpoint MDS_CHECKPOINT --output_dir OUTPUT_DIR --cache_dir /flare/Aurora_deployment/vsastry +# Tips : Do not run on login node. +# This script currently only takes care of tp=1. Takes an AuroraGPT Llama model trained with Megatron-DeepSpeed and converts it to the LlamaForCausalLM architecture from HuggingFace. + +import argparse +import torch +import os +from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer + +def repeat_kv_wt(x,np): + return torch.repeat_interleave(x, dim=0, repeats=np) + +def Update_llama_config(Llama_config, mds_args): + if mds_args['swiglu']: + Llama_config.hidden_act = "silu" + Llama_config.hidden_size = mds_args['hidden_size'] + Llama_config.intermediate_size = mds_args['ffn_hidden_size'] + Llama_config.max_position_embeddings = mds_args['max_position_embeddings'] + Llama_config.num_attention_heads = mds_args['num_attention_heads'] + Llama_config.num_hidden_layers = mds_args['num_layers'] + Llama_config.num_key_value_heads = mds_args['num_key_value_heads'] + Llama_config.rms_norm_eps = mds_args['layernorm_epsilon'] + Llama_config.rope_theta = mds_args['rope_theta'] + Llama_config.vocab_size = mds_args['padded_vocab_size'] + if mds_args['fp16'] == True: + Llama_config.torch_dtype = 'float16' + elif mds_args['bf16'] == True: + Llama_config.torch_dtype = 'bfloat16' + return Llama_config + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--mds_checkpoint', required=True) + parser.add_argument('--output_dir', required=True) + parser.add_argument('--cache_dir', required=True) + args = parser.parse_args() + + # make output_dir if it does not exist. 
+ if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + filename = str(args.mds_checkpoint) + if not filename.split("/")[-1].startswith('mp_rank') or not filename.split("/")[-1].endswith('.pt'): + raise ValueError("Provide the right file path, The file should be of format mp_rank_*.pt") # raise explicitly: a bare assert on a non-empty string never fails + print(f"loading mds checkpoint {filename}") + + mds_model = torch.load(args.mds_checkpoint,map_location=torch.device('cpu')) + Llama_model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",cache_dir=args.cache_dir) + + Llama_config = Llama_model.config + Updated_Llama_config = Update_llama_config(Llama_config, mds_model['args'].__dict__) + # save the updated config.json file + Updated_Llama_config.to_json_file(os.path.join(args.output_dir,'config.json')) + + state_dict = {} + dim = mds_model['args'].__dict__['kv_channels'] + inv_freq = 1.0 / (mds_model['args'].__dict__['rope_theta'] ** (torch.arange(0,dim, 2).float() / dim)) + hidden_size = mds_model['args'].__dict__['hidden_size'] + kv_dim = mds_model['args'].__dict__['kv_channels'] * mds_model['args'].__dict__['num_key_value_heads'] + kv_groups = mds_model['args'].__dict__['num_attention_heads'] // mds_model['args'].__dict__['num_key_value_heads'] + nkvheads = mds_model['args'].__dict__['num_key_value_heads'] + for layer_i in range(Updated_Llama_config.__dict__['num_hidden_layers']): + # SELF ATTENTION layers. + # get the q, k, v weights separately. Keeping k and v at the GQA head dim, since the transformers/models/llama/modelling_utils will take care of it. 
+ fused_qkv = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.self_attention.query_key_value.weight"] + fused_reshape = fused_qkv.view(nkvheads,(kv_groups+2)*dim,hidden_size) + ex_q = fused_reshape[:,:kv_groups*dim,:] + con_q = ex_q.contiguous().view(-1, fused_reshape.size(2)) + + ex_k = fused_reshape[:,kv_groups*dim:(kv_groups+1)*dim,:] + con_k = ex_k.contiguous().view(-1, fused_reshape.size(2)) + + ex_v = fused_reshape[:,(kv_groups+1)*dim:(kv_groups+2)*dim,:] + con_v = ex_v.contiguous().view(-1, fused_reshape.size(2)) + + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = con_q + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = con_k + #state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = repeat_kv_wt(fused_qkv[hidden_size:hidden_size+kv_dim], kv_groups) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = con_v + #state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = repeat_kv_wt(fused_qkv[hidden_size+kv_dim:hidden_size+2*kv_dim],kv_groups) + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.self_attention.dense.weight"] + + # MLP Layers + fused_mlp = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.mlp.dense_h_to_4h.weight"] + chunked_mlp = torch.chunk(fused_mlp,2,dim=0) + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = chunked_mlp[0] + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = chunked_mlp[1] + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.mlp.dense_4h_to_h.weight"] + + #LayerNorm weights and RoPe + state_dict[f"model.layers.{layer_i}.input_layernorm.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.input_layernorm.weight"] + state_dict[f"model.layers.{layer_i}.post_attention_layernorm.weight"] = 
mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.post_attention_layernorm.weight"] + + state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq + + # Get the non-encoder layer weights. + state_dict["model.embed_tokens.weight"] = mds_model['module']['language_model']['embedding']['word_embeddings']['weight'] + state_dict["model.norm.weight"] = mds_model['module']['language_model']['encoder']['final_layernorm.weight'] + state_dict["lm_head.weight"] = mds_model['module']['language_model']['output_layer']['weight'] + + # Save the model in the hf output path. + torch.save(state_dict, os.path.join(args.output_dir,"pytorch_model.bin")) + + + diff --git a/megatron/__init__.py b/megatron/__init__.py index d92a279ec6..4a7fe2cd82 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -9,6 +9,7 @@ from .global_vars import update_num_microbatches from .global_vars import get_tokenizer from .global_vars import get_tensorboard_writer +from .global_vars import get_wandb_writer from .global_vars import get_adlr_autoresume from .global_vars import get_timers from .initialize import initialize_megatron diff --git a/megatron/arguments.py b/megatron/arguments.py index d83fe99856..9b0e6ccb1a 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. """Megatron arguments.""" @@ -44,6 +45,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): parser = _add_inference_args(parser) parser = _add_transformer_engine_args(parser) parser = _add_retro_args(parser) + parser = _add_profiler_args(parser) # Custom arguments. 
if extra_args_provider is not None: @@ -73,6 +75,12 @@ def validate_args(args, defaults={}): assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\ ' ({}) is not divisible by tensor model parallel size ({})'.format( args.world_size, args.tensor_model_parallel_size) + # Zero bubble pipeline is defined on deepspeed's scheduler + if args.enable_zbh1_pipeline: + assert args.deepspeed, 'Use DeepSpeed to use zero-bubble H1 pipeline' + assert args.sequence_parallel == False, "Sequence Parallel not tested, proceed at own will by removing this line" + if args.enable_zbh1_exact_semantics: + assert args.enable_zbh1_pipeline, 'Exact semantics require ZBH1 pipeline enabled' # Pipeline model parallel size. args.pipeline_model_parallel_size = min( args.pipeline_model_parallel_size, @@ -95,8 +103,8 @@ def validate_args(args, defaults={}): args.ds_sequence_parallel_size assert args.world_size % model_parallel_size == 0, 'world size ({}) is not'\ ' divisible by tensor parallel size ({}) times pipeline parallel ' \ - 'size ({})'.format(args.world_size, args.tensor_model_parallel_size, - args.pipeline_model_parallel_size) + 'size ({}) times seqence parallel size ({})'.format(args.world_size, args.tensor_model_parallel_size, + args.pipeline_model_parallel_size, args.ds_sequence_parallel_size) args.data_parallel_size = args.world_size // model_parallel_size if args.rank == 0: print('using world size: {}, data-parallel-size: {}, ' @@ -391,7 +399,8 @@ def validate_args(args, defaults={}): args.async_tensor_model_parallel_allreduce = False if not args.use_dataset_only: - if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": + if deepspeed.accelerator.get_accelerator().device_name() == "cuda" \ + and os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": if args.sequence_parallel: raise RuntimeError( "Using sequence parallelism requires setting the environment variable " @@ -421,7 +430,7 @@ def validate_args(args, defaults={}): args.compression_training = False # 
FlashAttention - args.use_flash_attn = args.use_flash_attn_v1 or args.use_flash_attn_triton or args.use_flash_attn_v2 + args.use_flash_attn = args.use_flash_attn_v1 or args.use_flash_attn_triton or args.use_flash_attn_v2 or args.use_flash_attn_builder # AML if args.aml_data_download_path is not None: @@ -672,6 +681,9 @@ def _add_network_size_args(parser): help='Untie embeddings and output weights.'), group.add_argument('--embedding-weights-in-fp32', action='store_true', help='Cast word embedding weights to fp32 before embedding fwd.'), + group.add_argument('--kill-switch-file', type=str, default=None, + help='Location of kill switch file. ' + 'If found will automatically exit the program at runtime.') return parser @@ -740,6 +752,12 @@ def _add_logging_args(parser): group.add_argument('--log-world-size-to-tensorboard', action='store_true', help='Enable world size logging to tensorboard.') + group.add_argument('--wandb-project', type=str, default='', + help='The wandb project name. Ignore wandb by default.') + group.add_argument('--wandb-exp-name', type=str, default='', + help='The wandb experiment name.') + group.add_argument('--wandb-save-dir', type=str, default='', + help='Path to save the wandb results locally.') return parser @@ -762,6 +780,15 @@ def _add_regularization_args(parser): help='Weight decay increment function.') group.add_argument('--clip-grad', type=float, default=1.0, help='Gradient clipping based on global L2 norm.') + group.add_argument('--sophiag-beta1', type=float, default=0.9, + help='First coefficient for computing running averages ' + 'of gradient and its hessian') + group.add_argument('--sophiag-beta2', type=float, default=0.95, + help='Second coefficient for computing running averages ' + 'of gradient and its hessian') + group.add_argument('--sophiag-rho', type=float, default=0.01, + help='SophiaG clipping threshhold') + group.add_argument('--adam-beta1', type=float, default=0.9, help='First coefficient for computing running averages ' 
'of gradient and its square') @@ -800,7 +827,7 @@ def _add_training_args(parser): ' ' ' ' 'For example:' - ' --rampup-batch-size 16 8 300000 \ ' + ' --rampup-batch-size 16 8 300000 \\ ' ' --global-batch-size 1024' 'will start with global batch size 16 and over ' ' (1024 - 16) / 8 = 126 intervals will increase' @@ -835,6 +862,10 @@ def _add_training_args(parser): 'uniformly divided recompute unit, ' '2) block: the number of individual Transformer layers ' 'to recompute within each pipeline stage.') + group.add_argument('--enable-zbh1-pipeline', action='store_true', + help='Activate zero bubble pipeline parallelism schedule method') + group.add_argument('--enable-zbh1-exact-semantics', action='store_true', + help='Use an exact semantics for zbh1 schedule, might be slower than the default.') # deprecated # HACK: added back arguments because DeepSpeed still relies on the old @@ -874,6 +905,8 @@ def _add_training_args(parser): 'training if SIGTERM is received') group.add_argument('--tensorboard-dir', type=str, default=None, help='Write TensorBoard logs to this directory.') + group.add_argument('--trace-dir', type=str, default="./trace/", + help='Write trace logs to this directory.') group.add_argument('--no-masked-softmax-fusion', action='store_false', help='Disable fusion of query_key_value scaling, ' @@ -910,12 +943,43 @@ def _add_training_args(parser): 'https://arxiv.org/abs/2307.08691') group.add_argument('--use-flash-attn-triton', action='store_true', help='use FlashAttention implementation of attention using Triton.') + group.add_argument('--use-flash-attn-builder', action='store_true', + help='use FlashAttention op builder.') group.add_argument('--disable-bias-linear', action='store_false', help='Disable bias in the linear layers', dest='add_bias_linear') - group.add_argument('--optimizer', type=str, default='adam', - choices=['adam', 'sgd'], - help='Optimizer function') + group.add_argument( + '--optimizer', + type=str, + default='adam', + choices=[ + 'adam', + 
'adamw', + 'sophiag', + 'sgd', + 'ds.fusedlamb', + 'ipex.lamb', + 'ipex.fusedlamb', + 'apex.adam', + 'apex.sgd', + 'adamwschedulefree', + 'sgdschedulefree', + 'galoreadamw', + 'adam8bit', + 'galoreadamw8bit', + 'galoreadamw8bitperlayer' + ], + help='Optimizer function' + ) + group.add_argument( + "--schedulefree-for-each", + action="store_true", + help=""" + Use a foreach-backed implementation of the schedulefree optimizers. + Should be significantly faster, + but will have a higher peak memory usage. + """, + ) group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic'], help='Single pass vs multiple pass data loader') @@ -958,7 +1022,19 @@ def _add_training_args(parser): dest='gradient_accumulation_fusion') group.add_argument('--use-dataset-only', type=bool, required=False, default=False, help='If set to True, only use the megatron dataset for external trainer ') - group.add_argument('--profile', action='store_true', help='Enable Torch Profiler') + # group.add_argument('--profile', action='store_true', help='Enable Torch Profiler') + group.add_argument( + "--train-range-to-skip", + action="extend", + nargs="+", + type=int, + help=("Range of iters to skip during training. Must be in pairs."), + ) + group.add_argument('--train-iters-to-skip', action="extend", nargs="+", type=str, + help=( + "Specific train iterations to skip when training. " + "Load the data and just perform a noop." 
+ )) return parser @@ -1137,7 +1213,7 @@ def _add_distributed_args(parser): help='overlap pipeline parallel communication with forward and backward chunks', dest='overlap_p2p_comm') group.add_argument('--distributed-backend', default='nccl', - choices=['nccl', 'gloo', 'ccl'], + choices=['nccl', 'gloo', 'ccl', 'hccl'], help='Which backend to use for distributed training.') group.add_argument('--distributed-timeout-minutes', type=int, default=10, help='Timeout minutes for torch.distributed.') @@ -1215,6 +1291,10 @@ def _add_data_args(parser): group.add_argument('--data-file-list', type=str, default=None, help='The file with the list of dataset and weights') + group.add_argument('--shuffle-sample-in-corpus', action='store_true', help="Whether to shuffle the samples within in the dataset files") + + group.add_argument('--blend-sample-in-corpus', action='store_true', help="Whether to blend different files in the same corpus") + group.add_argument('--split', type=str, default='969, 30, 1', help='Comma-separated list of proportions for training,' ' validation, and test split. 
For example the split ' @@ -1281,6 +1361,8 @@ def _add_data_args(parser): help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='Sentencepiece tokenizer model.') + group.add_argument('--trust-remote-code', action='store_true', default=False, + help='To run HFTokenizer model from local path.') group.add_argument('--data-impl', type=str, default='infer', choices=['mmap', 'infer'], help='Implementation of indexed datasets.') @@ -1310,6 +1392,7 @@ def _add_data_args(parser): help='Force to use certain index file.') group.add_argument('--repeated-dataloader', action='store_true', help='Once all the data has been loaded, reuse the DataLoader.') + group.add_argument('--multiprocessing-context', type=str, default='fork') return parser @@ -1466,6 +1549,8 @@ def _add_zero_args(parser): help='Remote device for ZeRO-3 initialized parameters.') group.add_argument('--use-pin-memory', action='store_true', help='Use pinned CPU memory for ZeRO-3 initialized model parameters.') + group.add_argument('--use-mics', action='store_true', + help='Use MiCS') return parser def _add_memoryopt_args(parser): @@ -1510,7 +1595,6 @@ def _add_activation_checkpoint_args(parser): def _add_distillation_args(parser): group = parser.add_argument_group('Knowledge distillation', 'Distillation Configurations') - group.add_argument('--num-layers-teacher', type=int, default=None, help='Number of the teacher transformer layers.') group.add_argument('--num-experts-teacher', type=int, nargs='+', default=[1,], @@ -1519,7 +1603,6 @@ def _add_distillation_args(parser): help='Tansformer teacher hidden size.') group.add_argument('--num-attention-heads-teacher', type=int, default=None, help='Number of teacher transformer attention heads.') - group.add_argument('--mos', action='store_true', help='Enable Mixture-of-Students via knolwedge distillation.') group.add_argument('--kd', action='store_true', @@ -1529,8 +1612,30 @@ def _add_distillation_args(parser): 
group.add_argument('--kd-temp', default=1.0, type=float) group.add_argument('--reset-iteration', action='store_true', help='Reset the iteration count.') - group.add_argument('--load-teacher', type=str, default=None, help='Directory containing a teacher model checkpoint.') return parser + + +def _add_profiler_args(parser): + group = parser.add_argument_group(title='profiling configuration') + + group.add_argument("--profile", + type=str, + default=None, + choices=['pt', 'pt-full'], + help="Enable profiling, pt-full gives call stack compared to pt") + + group.add_argument("--profile_steps", + type=str, + default='2,3', + help="Which steps to profile. Format: ,") + + group.add_argument("--profile-ranks", + type=int, + nargs='+', + default=None, + help="Which ranks to profile. Format: 0 1 2 3") + + return parser diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 3e4d20035e..a4f82ec9d3 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -8,6 +8,12 @@ import numpy as np from deepspeed.accelerator import get_accelerator import torch +import ezpz as ez +import logging +import torch.distributed as tdist + +import yaml +from pathlib import Path from megatron import update_num_microbatches, get_tokenizer from megatron.core import mpu, tensor_parallel @@ -15,6 +21,7 @@ from .utils import (unwrap_model, print_rank_0, is_rank_0) +from .utils import PerfTrace, Profile from deepspeed.checkpoint import ( ORIGINAL_VOCAB_SIZE, @@ -24,9 +31,15 @@ UNIVERSAL_CHECKPOINT_VERSION_VALUE, ) -_CHECKPOINT_VERSION = None +RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +DEVICE = ez.get_torch_device() +log = logging.getLogger(__name__) +log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") +_CHECKPOINT_VERSION = None +dlp = Profile("CHECKPOINT") def set_checkpoint_version(value): global _CHECKPOINT_VERSION if _CHECKPOINT_VERSION is not None: @@ -155,7 +168,7 @@ def get_checkpoint_tracker_filename(checkpoints_path): training to restart 
from.""" return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt') - +@dlp.log def read_metadata(tracker_filename): # Read the tracker file and either set the iteration or # mark it as a release checkpoint. @@ -195,7 +208,7 @@ def read_metadata(tracker_filename): max_iter = iteration return max_iter, release - +@dlp.log def get_rng_state(): """ collect rng state across data parallel ranks """ args = get_args() @@ -221,10 +234,16 @@ def get_rng_state(): return rng_state_list - +@dlp.log def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): """Save a model checkpoint.""" args = get_args() + assert args is not None + args_iter = args.iteration + if args_iter != iteration: + log.warning(f"{args.iteration=} != {iteration} passed to 'save_checkpoint'") + + save_lr_state_dict() # Only rank zero of the data parallel writes to the disk. if not args.deepspeed: @@ -322,7 +341,7 @@ def state_dict_for_save_checkpoint_deepspeed(destination=None, prefix='', keep_v if torch.distributed.is_initialized(): torch.distributed.barrier() - +@dlp.log def _transpose_first_dim(t, num_splits, num_splits_first, model): input_shape = t.size() # We use a self_attention module but the values extracted aren't @@ -392,7 +411,7 @@ def fix_query_key_value_ordering(model, checkpoint_version): print_rank_0(" succesfully fixed query-key-values ordering for" " checkpoint version {}".format(checkpoint_version)) - +@dlp.log def _load_base_checkpoint(load_dir, rank0=False): """ Load the base state_dict from the given directory @@ -447,7 +466,7 @@ def _load_base_checkpoint(load_dir, rank0=False): return state_dict, release - +@dlp.log def load_args_from_checkpoint(args, load_arg='load'): """Set required arguments from the checkpoint specified in the arguments. 
@@ -528,16 +547,82 @@ def _set_arg(arg_name, old_arg_name=None, force=False): _set_arg('num_layers_per_virtual_pipeline_stage') return args, checkpoint_args - -def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True, load_only_weights=False): +@dlp.log +def load_lr_state_dict(strict: bool = False) -> dict: + """Load {iteration, lr} from .yaml file when restoring from checkpoint.""" + args = get_args() + assert args is not None + lr_state_dict_fp = Path(args.load).joinpath( + f"lr_state_dict_{RANK}_of_{WORLD_SIZE}.yaml" + ) + lr_state_dict = {} + if lr_state_dict_fp.is_file(): + with lr_state_dict_fp.open('r') as f: + lr_state_dict = yaml.safe_load(f) + args.lr = lr_state_dict['lr'] + else: + if strict: + raise FileNotFoundError( + f"{lr_state_dict_fp=}.is_file() is False" + ) + log.info( + f"Unable to load lr_state_dict from {lr_state_dict_fp=}, " + f"but strict=False. Returning empty dictionary: {lr_state_dict=}" + ) + return lr_state_dict + +@dlp.log +def save_lr_state_dict() -> None: + """Save {iteration, lr} to .yaml file for safe-keeping. + + Make sure we're only saving from RANK == 0. + """ + if RANK != 0: + return None + args = get_args() + assert args is not None + outdir = getattr(args, 'save', None) + assert outdir is not None + lr_state_dict_fp = Path(args.save).joinpath( + "lr_state_dict.yaml" + ) + log.info(f"Saving lr_state_dict to {lr_state_dict_fp.as_posix()}") + with lr_state_dict_fp.open('w') as f: + yaml.dump( + {'iteration': args.iteration, 'lr': args.lr}, + f + ) + +@dlp.log +def load_checkpoint( + model, + optimizer, + opt_param_scheduler, + load_arg: str = 'load', + strict: bool = True, + load_only_weights: bool = False, + strict_lr_state_dict: bool = False +): """Load a model checkpoint and return the iteration. strict (bool): whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint match the names of parameters and buffers in model. 
""" args = get_args() + assert args is not None load_dir = getattr(args, load_arg) - + lr_state_dict = {} + lr_tensor = torch.tensor(args.lr, requires_grad=False, device=DEVICE) + if RANK == 0: + lr_state_dict = load_lr_state_dict(strict=strict_lr_state_dict) + if len(lr_state_dict.keys()) > 0 and 'lr' in lr_state_dict: + lr_tensor = torch.tensor( + lr_state_dict['lr'], + requires_grad=False, + device=DEVICE, + ) + tdist.broadcast(lr_tensor, 0) + args.lr = lr_tensor.item() if args.deepspeed: if args.finetune: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, @@ -553,7 +638,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri print_rank_0(' will not load any checkpoints and will start from ' 'random') return 0 - release = False + release = False else: model = unwrap_model(model) @@ -729,7 +814,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri return iteration - +@dlp.log def load_biencoder_checkpoint(model, only_query_model=False, only_context_model=False, custom_load_path=None): """ diff --git a/megatron/core/pipeline_parallel/deepspeed_zbh1_engine.py b/megatron/core/pipeline_parallel/deepspeed_zbh1_engine.py new file mode 100644 index 0000000000..ba451d9705 --- /dev/null +++ b/megatron/core/pipeline_parallel/deepspeed_zbh1_engine.py @@ -0,0 +1,110 @@ +from megatron.core.tensor_parallel.weight_grad_store import WeightGradStore + +from deepspeed.runtime.pipe.engine import PipelineEngine +from deepspeed.utils.timer import BACKWARD_MICRO_TIMER, \ + BACKWARD_GLOBAL_TIMER, BACKWARD_INNER_MICRO_TIMER, BACKWARD_INNER_GLOBAL_TIMER +from deepspeed.runtime.utils import PartitionedTensor +from deepspeed.accelerator import get_accelerator + +import torch +from torch.cuda.amp import custom_bwd +from packaging import version + + +from megatron.core.parallel_state import ( + get_tensor_model_parallel_world_size, + get_tensor_model_parallel_group, + get_global_memory_buffer, +) + +def 
_exec_backward_only_pass(self, buffer_id): + assert self.optimizer is not None, "must provide optimizer during " \ + "init in order to use backward" + + self.mem_status('BEFORE BWD ONLY', reset_max=True) + from megatron.core.tensor_parallel.layers import LinearWithGradAccumulationAndAsyncCommunication + WeightGradStore.set_combine_bw(False) + # The last stage just runs backward on the loss using DeepSpeed's typical + # mechanisms. + if self.is_last_stage(): + super(PipelineEngine, self).backward(self.loss) + WeightGradStore.flush() + self.mem_status('AFTER BWD ONLY') + + WeightGradStore.set_combine_bw(True) + return + + outputs = self.pipe_buffers['outputs'][buffer_id] + + if self.wall_clock_breakdown(): + self.timers(BACKWARD_MICRO_TIMER).start() + self.timers(BACKWARD_GLOBAL_TIMER).start() + self.timers(BACKWARD_INNER_MICRO_TIMER).start() + self.timers(BACKWARD_INNER_GLOBAL_TIMER).start() + + # Reconstruct if we previously partitioned the output. We must be + # careful to also restore the computational graph of the tensors we partitioned. 
+ if self.is_pipe_partitioned: + if self.is_grad_partitioned: + if self.pipe_partition_output_meta_cache is None: + self.pipe_partition_output_meta_cache = outputs[0].to('cpu') + part_output = PartitionedTensor.from_meta(meta=self.pipe_partition_output_meta_cache, + local_part=outputs[1], + group=self.grid.get_slice_parallel_group()) + self.pipe_buffers['output_tensors'][buffer_id].data = part_output.full() + outputs = (self.pipe_buffers['output_tensors'][buffer_id], *outputs[2:]) + else: + # Already restored from partition + self.pipe_buffers['output_tensors'][buffer_id].data = outputs[0] + outputs = (self.pipe_buffers['output_tensors'][buffer_id], *outputs[1:]) + + grad_tensors = self.grad_layer + if self.is_grad_partitioned: + if self.grad_partition_grad_layer_meta_cache is None: + self.grad_partition_grad_layer_meta_cache = self.grad_layer[0].to('cpu') + part_grad = PartitionedTensor.from_meta(meta=self.grad_partition_grad_layer_meta_cache, + local_part=self.grad_layer[1], + group=self.grid.get_slice_parallel_group()) + grad_tensors = (part_grad.full(), *grad_tensors[2:]) + part_grad = None + + if self.using_bf16_optimizer and not self.is_last_stage(): + # manually call because we don't call optimizer.backward() + self.optimizer.clear_lp_grads() + + # This handles either a single tensor or tuple of tensors. 
+ + if isinstance(outputs, tuple): + out_tensors = [t for t in outputs if t.is_floating_point()] + assert len(out_tensors) == len(grad_tensors) + torch.autograd.backward(tensors=out_tensors, grad_tensors=grad_tensors) + else: + torch.autograd.backward(tensors=(outputs, ), grad_tensors=(grad_tensors, )) + + + WeightGradStore.flush() + + if self.using_bf16_optimizer and not self.is_last_stage(): + # manually call because we don't call optimizer.backward() + self.optimizer.update_hp_grads(clear_lp_grads=False) + + # Free up the memory from the output of forward() + self.pipe_buffers['output_tensors'][buffer_id] = None + self.pipe_buffers['outputs'][buffer_id] = None + grad_tensors = None + + WeightGradStore.set_combine_bw(True) + + if self.wall_clock_breakdown(): + self.timers(BACKWARD_INNER_MICRO_TIMER).stop() + self.timers(BACKWARD_INNER_GLOBAL_TIMER).stop() + self.timers(BACKWARD_MICRO_TIMER).stop() + self.timers(BACKWARD_GLOBAL_TIMER).stop() + +def _exec_weight_pass(self): + if self.using_bf16_optimizer: + # manually call because we don't call optimizer.backward() + self.optimizer.clear_lp_grads() + WeightGradStore.pop() + if self.using_bf16_optimizer: + self.optimizer.update_hp_grads(clear_lp_grads=False) \ No newline at end of file diff --git a/megatron/core/pipeline_parallel/deepspeed_zbh1_schedule.py b/megatron/core/pipeline_parallel/deepspeed_zbh1_schedule.py new file mode 100644 index 0000000000..651aadce72 --- /dev/null +++ b/megatron/core/pipeline_parallel/deepspeed_zbh1_schedule.py @@ -0,0 +1,148 @@ +from deepspeed.runtime.pipe.schedule import PipeSchedule, PipeInstruction, BufferOpInstruction, \ + LoadMicroBatch, RecvActivation, SendActivation, RecvGrad, SendGrad, \ + ForwardPass, BackwardPass, ReduceGrads, ReduceTiedGrads, OptimizerStep +from megatron import get_args + +class ZeroBubbleH1Pipeline(PipeSchedule): + """A schedule for training a batch using hybrid parallelism. 
+ + Pipeline parallelism is extracted through gradient accumulation and thus + convergence follows that of a data parallel approach with the same batch + size. + """ + + def steps(self): + num_warmup_microbatches = self.stages - self.stage_id + + forward = 0 + backward = 0 + weight = 0 + + # F section + for _ in range(num_warmup_microbatches - 1): + if forward == self.micro_batches: + continue + forward_id = self.get_buffer_id(forward) + forward += 1 + + cmds = [] + if not self.is_first_stage: + cmds.append(RecvActivation(forward_id)) + if self.is_first_stage or self.is_last_stage: + cmds.append(LoadMicroBatch(forward_id)) + cmds.append(ForwardPass(forward_id)) + if not self.is_last_stage: + cmds.append(SendActivation(forward_id)) + yield cmds + + # FB section + for _ in range(self.stage_id): + if forward == self.micro_batches: + continue + forward_id = self.get_buffer_id(forward) + backward_id = self.get_buffer_id(backward) + forward += 1 + backward += 1 + + cmds = [] + if not self.is_first_stage: + cmds.append(RecvActivation(forward_id)) + if self.is_first_stage or self.is_last_stage: + cmds.append(LoadMicroBatch(forward_id)) + cmds.append(ForwardPass(forward_id)) + if not self.is_last_stage: + cmds.append(RecvGrad(backward_id)) + cmds.append(SendActivation(forward_id)) + cmds.append(BackwardOnlyPass(backward_id)) + if not self.is_first_stage: + cmds.append(SendGrad(backward_id)) + yield cmds + + # FBW section + while forward < self.micro_batches: + forward_id = self.get_buffer_id(forward) + backward_id = self.get_buffer_id(backward) + forward += 1 + backward += 1 + weight += 1 + + cmds = [] + if not self.is_first_stage: + cmds.append(RecvActivation(forward_id)) + if self.is_first_stage or self.is_last_stage: + cmds.append(LoadMicroBatch(forward_id)) + cmds.append(ForwardPass(forward_id)) + if not self.is_last_stage: + cmds.append(RecvGrad(backward_id)) + cmds.append(SendActivation(forward_id)) + if self.is_first_stage: + cmds.append(BackwardPass(backward_id)) + 
elif forward == self.micro_batches: + cmds.append(BackwardOnlyPass(backward_id)) + cmds.append(SendGrad(backward_id)) + cmds.append(WeightPass()) + else: + if get_args().enable_zbh1_exact_semantics: + cmds.append(BackwardOnlyPass(backward_id)) + cmds.append(SendGrad(backward_id)) + cmds.append(WeightPass()) + else: + cmds.append(BackwardPass(backward_id)) + cmds.append(SendGrad(backward_id)) + yield cmds + + #BW section + while backward < self.micro_batches: + backward_id = self.get_buffer_id(backward) + backward += 1 + weight += 1 + + cmds = [] + if not self.is_last_stage: + cmds.append(RecvGrad(backward_id)) + if self.is_first_stage: + cmds.append(BackwardPass(backward_id)) + else: + cmds.append(BackwardOnlyPass(backward_id)) + cmds.append(SendGrad(backward_id)) + cmds.append(WeightPass()) + yield cmds + + #W section + while weight < self.micro_batches: + weight += 1 + yield [WeightPass()] + + yield [ReduceTiedGrads(), ReduceGrads(), OptimizerStep()] + + def get_buffer_id(self, microbatch_id): + num_warmup_microbatches = self.stages - self.stage_id + return microbatch_id % num_warmup_microbatches + + +##Additional Instruction classes +class BackwardOnlyPass(BufferOpInstruction): + """Compute a backward pass and accumulate gradients. + + Roughly: + + .. code-block:: python + + outputs = buffers['outputs'][buffer_id] + gradients = buffers['gradients'][buffer_id] + torch.autograd.backward(tensors=outputs, + grad_tensors=gradients, inputs = input_tensor) + """ + pass + +class WeightPass(PipeInstruction): + """Compute a weight pass and accumulate gradients. + + Roughly: + + .. 
code-block:: python + + torch.autograd.backward(tensors=outputs, + grad_tensors=gradients, inputs = model.parameters()) + """ + pass diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py index b23f6c84b3..78e43e7fed 100644 --- a/megatron/core/pipeline_parallel/p2p_communication.py +++ b/megatron/core/pipeline_parallel/p2p_communication.py @@ -16,7 +16,8 @@ from megatron.core import ModelParallelConfig from deepspeed.accelerator import get_accelerator - +from megatron.utils import Profile +dlp = Profile("PIPELINE") # Types Shape = Union[List[int], torch.Size] @@ -329,6 +330,7 @@ def _ring_exchange_wrapper(**kwargs): return tensor_recv_prev, tensor_recv_next, reqs +@dlp.log def recv_forward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: """ Receive tensor from previous rank in pipeline (forward receive). @@ -353,7 +355,7 @@ def recv_forward(tensor_shape: Shape, config.timers('forward-recv').stop() return input_tensor - +@dlp.log def recv_backward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: """Receive tensor from next rank in pipeline (backward receive). @@ -376,7 +378,7 @@ def recv_backward(tensor_shape: Shape, config.timers('backward-recv').stop() return output_tensor_grad - +@dlp.log def send_forward(output_tensor: torch.Tensor, config: ModelParallelConfig) -> None: """Send tensor to next rank in pipeline (forward send). @@ -397,7 +399,7 @@ def send_forward(output_tensor: torch.Tensor, if config.timers is not None: config.timers('forward-send').stop() - +@dlp.log def send_backward(input_tensor_grad: torch.Tensor, config: ModelParallelConfig) -> None: """Send tensor to previous rank in pipeline (backward send). 
@@ -417,7 +419,7 @@ def send_backward(input_tensor_grad: torch.Tensor, if config.timers is not None: config.timers('backward-send').stop() - +@dlp.log def send_forward_recv_backward(output_tensor: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: @@ -441,7 +443,7 @@ def send_forward_recv_backward(output_tensor: torch.Tensor, config.timers('forward-send-backward-recv').stop() return output_tensor_grad - +@dlp.log def send_backward_recv_forward(input_tensor_grad: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: @@ -465,7 +467,7 @@ def send_backward_recv_forward(input_tensor_grad: torch.Tensor, config.timers('backward-send-forward-recv').stop() return input_tensor - +@dlp.log def send_forward_recv_forward(output_tensor: torch.Tensor, recv_prev: bool, tensor_shape: Shape, @@ -491,7 +493,7 @@ def send_forward_recv_forward(output_tensor: torch.Tensor, return input_tensor, wait_handles return input_tensor - +@dlp.log def send_backward_recv_backward(input_tensor_grad: torch.Tensor, recv_next: bool, tensor_shape: Shape, @@ -517,7 +519,7 @@ def send_backward_recv_backward(input_tensor_grad: torch.Tensor, return output_tensor_grad, wait_handles return output_tensor_grad - +@dlp.log def send_forward_backward_recv_forward_backward( output_tensor: torch.Tensor, input_tensor_grad: torch.Tensor, diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 407bb16d56..c24959c64f 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -14,10 +14,12 @@ from megatron.core.enums import ModelType from megatron.core.utils import get_attr_wrapped_model, get_model_type, get_model_config -from megatron.utils import unwrap_model +from megatron.utils import print_rank_0, unwrap_model from megatron.model import DistributedDataParallel as LocalDDP from megatron.model import Float16Module +from megatron.utils import Profile +dlp 
= Profile("CORE") # Types Shape = Union[List[int], torch.Size] @@ -124,6 +126,7 @@ def deallocate_output_tensor(out, deallocate_pipeline_outputs=False): dtype = out.dtype, ) +@dlp.log def custom_backward(output, grad_output): '''Directly call C++ autograd engine. @@ -162,7 +165,7 @@ def custom_backward(output, grad_output): - +@dlp.log def forward_step(forward_step_func, data_iterator, model, @@ -227,8 +230,15 @@ def forward_step(forward_step_func, return output_tensor return [output_tensor] - -def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model=None): +@dlp.log +def backward_step( + input_tensor, + output_tensor, + output_tensor_grad, + model_type, + config, + model=None +): """Backward step through passed-in output tensor. If last stage, output_tensor_grad is None, otherwise gradient of loss @@ -241,12 +251,19 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c # needs to be modified slightly to support arbitrary numbers of skip # connections. args = get_args() - if args.deepspeed: - assert model is not None - + assert args is not None if config.timers is not None: config.timers('backward-compute', log_level=2).start() - + if (to_skip := getattr(args, 'train_iters_to_skip', None)) is not None: + if config.timers is not None: + config.timers('backward-compute').stop() + if len(to_skip) > 0 and args.iteration in [int(i) for i in to_skip]: + print_rank_0( + f'Caught {args.iteration=} in `iters_to_skip`! Skipping!' + ) + return [None] + if args.deepspeed: + assert model is not None # Retain the grad on the input_tensor. 
unwrap_input_tensor_grad = False if not isinstance(input_tensor, list): @@ -255,24 +272,20 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c for x in input_tensor: if x is not None: x.retain_grad() - if not isinstance(output_tensor, list): output_tensor = [output_tensor] if not isinstance(output_tensor_grad, list): output_tensor_grad = [output_tensor_grad] - # Backward pass. if args.deepspeed: model.backward(output_tensor[0]) else: if output_tensor_grad[0] is None and config.grad_scale_func is not None: output_tensor[0] = config.grad_scale_func(output_tensor[0]) - if config.deallocate_pipeline_outputs: custom_backward(output_tensor[0], output_tensor_grad[0]) else: torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0]) - # Collect the grad of the input_tensor. input_tensor_grad = [None] if input_tensor is not None: @@ -282,7 +295,6 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c input_tensor_grad.append(None) else: input_tensor_grad.append(x.grad) - # Handle single skip connection if it exists (encoder_hidden_state in # model with encoder and decoder). 
if parallel_state.get_pipeline_model_parallel_world_size() > 1 and \ @@ -292,13 +304,11 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c input_tensor_grad[-1].add_(output_tensor_grad[1]) if unwrap_input_tensor_grad: input_tensor_grad = input_tensor_grad[0] - if config.timers is not None: config.timers('backward-compute').stop() - return input_tensor_grad - +@dlp.log def forward_backward_no_pipelining(*, forward_step_func, data_iterator: Union[Iterator, List[Iterator]], @@ -345,7 +355,7 @@ def forward_backward_no_pipelining(*, forward_data_store = [] input_tensor, output_tensor_grad = None, None with no_sync_func(): - for i in range(num_microbatches - 1): + for i in dlp.iter(range(num_microbatches - 1)): output_tensor = forward_step(forward_step_func, data_iterator, model, num_microbatches, input_tensor, forward_data_store, config, collect_non_loss_data) if not forward_only: @@ -363,7 +373,7 @@ def forward_backward_no_pipelining(*, return forward_data_store - +@dlp.log def forward_backward_pipelining_with_interleaving(*, forward_step_func, data_iterator: Union[Iterator, List[Iterator]], @@ -916,7 +926,7 @@ def get_tensor_shapes(*, return tensor_shapes - +@dlp.log def recv_forward(tensor_shapes, config): input_tensors = [] for tensor_shape in tensor_shapes: @@ -926,7 +936,7 @@ def recv_forward(tensor_shapes, config): input_tensors.append(p2p_communication.recv_forward(tensor_shape, config)) return input_tensors - +@dlp.log def recv_backward(tensor_shapes, config): output_tensor_grads = [] for tensor_shape in tensor_shapes: @@ -936,7 +946,7 @@ def recv_backward(tensor_shapes, config): output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape, config)) return output_tensor_grads - +@dlp.log def send_forward(output_tensors, tensor_shapes, config): if not isinstance(output_tensors, list): output_tensors = [output_tensors] @@ -945,7 +955,7 @@ def send_forward(output_tensors, tensor_shapes, config): continue 
p2p_communication.send_forward(output_tensor, config) - +@dlp.log def send_backward(input_tensor_grads, tensor_shapes, config): if not isinstance(input_tensor_grads, list): input_tensor_grads = [input_tensor_grads] @@ -954,7 +964,7 @@ def send_backward(input_tensor_grads, tensor_shapes, config): continue p2p_communication.send_backward(input_tensor_grad, config) - +@dlp.log def send_forward_recv_backward(output_tensors, tensor_shapes, config): if not isinstance(output_tensors, list): output_tensors = [output_tensors] @@ -968,7 +978,7 @@ def send_forward_recv_backward(output_tensors, tensor_shapes, config): output_tensor_grads.append(output_tensor_grad) return output_tensor_grads - +@dlp.log def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config): if not isinstance(input_tensor_grads, list): input_tensor_grads = [input_tensor_grads] @@ -982,7 +992,7 @@ def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config): input_tensors.append(input_tensor) return input_tensors - +@dlp.log def forward_backward_pipelining_without_interleaving(*, forward_step_func, data_iterator: Union[Iterator, List[Iterator]], diff --git a/megatron/core/tensor_parallel/cross_entropy.py b/megatron/core/tensor_parallel/cross_entropy.py index 9dcdc0459f..d0453d25ea 100644 --- a/megatron/core/tensor_parallel/cross_entropy.py +++ b/megatron/core/tensor_parallel/cross_entropy.py @@ -69,14 +69,14 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): vocab_size = exp_logits.size(-1) if label_smoothing > 0: - """ + r""" We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth. 
= (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt}) = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K - From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py + From: """ assert 1.0 > label_smoothing > 0.0 smoothing = label_smoothing * vocab_size / (vocab_size - 1) diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 020d25915a..3dd3299ae0 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # Parts of the code here are adapted from PyTorch @@ -16,6 +17,8 @@ from torch.cuda.amp import custom_fwd, custom_bwd +from megatron import get_args + from megatron.core.model_parallel_config import ModelParallelConfig from megatron.core.parallel_state import ( @@ -233,6 +236,11 @@ def __init__(self, sequence_length, embedding_dim): def forward(self, position_ids): return self.local_embeddings(position_ids - self.offset) +def gradientUpdateFunction(total_input, grad_output, weight): + if weight.grad == None: + weight.grad = grad_output.t().matmul(total_input) + else: + weight.grad += grad_output.t().matmul(total_input) class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function): """See linear_with_grad_accumulation_and_async_allreduce""" @@ -278,6 +286,7 @@ def forward(ctx, input, weight, bias, gradient_accumulation_fusion, @staticmethod @custom_bwd def backward(ctx, grad_output): + args = get_args() input, weight = ctx.saved_tensors use_bias = ctx.use_bias @@ -359,7 +368,13 @@ def backward(ctx, grad_output): # grad_weight = None 
# else: # grad_weight = grad_output.t().matmul(total_input) - grad_weight = grad_output.t().matmul(total_input) + if args.enable_zbh1_pipeline: + from megatron.core.tensor_parallel.weight_grad_store import WeightGradStore + WeightGradStore.put(total_input, grad_output, weight, gradientUpdateFunction) + grad_weight = None + else: + grad_weight = grad_output.t().matmul(total_input) + grad_bias = grad_output.sum(dim=0) if use_bias else None if ctx.sequence_parallel: @@ -441,7 +456,8 @@ def linear_with_grad_accumulation_and_async_allreduce( ] if not linear_with_grad_accumulation_and_async_allreduce.warned: - if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": + if get_accelerator().device_name() == "cuda" \ + and os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": if sequence_parallel: warnings.warn( "When using sequence parallelism it is recommended to set the " diff --git a/megatron/core/tensor_parallel/weight_grad_store.py b/megatron/core/tensor_parallel/weight_grad_store.py new file mode 100644 index 0000000000..bbd1aea533 --- /dev/null +++ b/megatron/core/tensor_parallel/weight_grad_store.py @@ -0,0 +1,34 @@ +import queue + +class WeightGradStore: + + cache = [] + weight_grad_queue = queue.Queue() + combine_bw = True + + @classmethod + def set_combine_bw(cls, combine_bw): + # For the following backward pass, combine W with B and skip next W. + cls.combine_bw = combine_bw + + @classmethod + def put(cls, total_input, grad_output, weight, func): + if cls.combine_bw == True: + func(total_input, grad_output, weight) + return + # Store the weight gradient computation of linear layers. + cls.cache.append((total_input, grad_output, weight, func)) + + @classmethod + def flush(cls): + # Collect all stored computations during backward as a W. + cls.weight_grad_queue.put(cls.cache) + cls.cache = [] + + @classmethod + def pop(cls): + # Execute a single W. 
+ assert cls.weight_grad_queue.qsize() > 0 + stored_grads = cls.weight_grad_queue.get() + for total_input, grad_output, weight, func in stored_grads: + func(total_input, grad_output, weight) \ No newline at end of file diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py old mode 100644 new mode 100755 index f3276c6823..ab164fdc48 --- a/megatron/data/blendable_dataset.py +++ b/megatron/data/blendable_dataset.py @@ -6,16 +6,23 @@ import os import time +import logging import numpy as np import torch from deepspeed.accelerator import get_accelerator -from megatron import print_rank_0 +# from megatron import print_rank_0 from megatron.core import mpu +from megatron.utils import Profile, PerfTrace +from mpi4py import MPI -class BlendableDataset(torch.utils.data.Dataset): +from megatron.utils import get_logger +log = get_logger(__name__, rank_zero_only=True) +dlp = Profile("DATASET") +class BlendableDataset(torch.utils.data.Dataset): + @dlp.log def __init__(self, datasets, weights, size, *, data_cache_path=None): @@ -32,8 +39,9 @@ def __init__(self, datasets, weights, size, *, weights /= sum_weights # Build indicies. 
+ @dlp.log def _build_indices(): - start_time = time.time() + start_time = time.perf_counter() dataset_index = np.zeros(self.size, dtype=np.int64) dataset_sample_index = np.zeros(self.size, dtype=np.int64) @@ -41,8 +49,10 @@ def _build_indices(): helpers.build_blending_indices(dataset_index, dataset_sample_index, weights, num_datasets, self.size, torch.distributed.get_rank() == 0) - print_rank_0('> elapsed time for building blendable dataset indices: ' - '{:.2f} (sec)'.format(time.time() - start_time)) + log.info( + "> elapsed time for building blendable dataset indices: " + f"{time.perf_counter() - start_time:.2f} (sec)" + ) return dataset_index, dataset_sample_index desc = "Blendable dataset\n\n" @@ -52,7 +62,8 @@ def _build_indices(): desc += f"Weights: {weights}\n" desc += f"Size: {size}\n" self.desc = desc - + self.dataset_index = np.zeros(self.size, dtype=np.int64) + self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) if data_cache_path: desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest() desc_path = os.path.join(data_cache_path, desc_hash + ".dsc") @@ -65,38 +76,46 @@ def _build_indices(): ' dataset, building indices on rank 0 ...', flush=True) dataset_index, dataset_sample_index = _build_indices() try: + log.debug(" > saving index map files") + start_time = time.perf_counter() os.makedirs(os.path.dirname(index_path), exist_ok=True) with open(desc_path, 'wt') as fd: fd.write(desc) np.save(index_path, dataset_index, allow_pickle=True) np.save(sample_index_path, dataset_sample_index, allow_pickle=True) + log.info(f" > finished saving index map files in {time.perf_counter() - start_time} seconds") except OSError: print(f'There was an error trying to create the data cache directory ({data_cache_path})') print('or a file in it. This is set with the --data-cache-path argument. 
Please') print('ensure you have write access to this directory or specify one that you do have') print('write access to.') cache_success = False - - + self.dataset_index = dataset_index + self.dataset_sample_index = dataset_sample_index + ''' I don't think the following piece of code is necessary any more; I commented them out now counts = get_accelerator().LongTensor([cache_success]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) if counts[0].item() != ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // - torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())): - print_rank_0("Data index creation unsuccessful, exiting.") + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // + torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())): + log.info("Data index creation unsuccessful, exiting.") exit() + ''' + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group()) + torch.distributed.barrier(group=mpu.get_data_parallel_group()) - # Load on all ranks. 
- print_rank_0(f'> loading blendable dataset index: {index_path}') + start_time = time.perf_counter() + log.info(f'> loading blendable dataset index: {index_path}') self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r') assert self.dataset_index.size == self.size - - print_rank_0(f'> loading blendable dataset sample index: {sample_index_path}') + log.info(f'> loading blendable dataset sample index: {sample_index_path}') self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r') assert self.dataset_sample_index.size == self.size + log.info(f'> finished loading in {time.perf_counter() - start_time} seconds') else: self.dataset_index, self.dataset_sample_index = _build_indices() @@ -108,14 +127,14 @@ def _build_indices(): raise RuntimeError('BlendedDataset size is improperly bounded') except IndexError: pass - print_rank_0('> size of blendable dataset: ' + log.info('> size of blendable dataset: ' '{} samples'.format(self.size)) def __len__(self): return self.size - + @dlp.log def __getitem__(self, idx): dataset_idx = self.dataset_index[idx] sample_idx = self.dataset_sample_index[idx] diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 8eb2f2a668..b242101b3a 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -2,7 +2,6 @@ """Dataloaders.""" - import random import torch import numpy as np @@ -46,7 +45,10 @@ def build_pretraining_data_loader(dataset, consumed_samples): batch_sampler=batch_sampler, num_workers=args.num_workers, pin_memory=True, - # multiprocessing_context='spawn' + multiprocessing_context=( + args.multiprocessing_context if args.num_workers > 0 + else None + ) ) if args.repeated_dataloader: loader=RepeatingLoader(loader) @@ -54,28 +56,39 @@ def build_pretraining_data_loader(dataset, consumed_samples): class MegatronPretrainingSampler: - def __init__(self, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, 
data_parallel_size, drop_last=True): + def __init__( + self, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + drop_last=True + ): # Keep a copy of input params for later use. self.total_samples = total_samples self.consumed_samples = consumed_samples self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank - self.micro_batch_times_data_parallel_size = \ + self.micro_batch_times_data_parallel_size = ( self.micro_batch_size * data_parallel_size + ) self.drop_last = drop_last # Sanity checks. - assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) - assert self.consumed_samples < self.total_samples, \ - 'no samples left to consume: {}, {}'.format(self.consumed_samples, - self.total_samples) + assert self.total_samples > 0, ( + f'no sample to consume: {self.total_samples}' + ) + assert self.consumed_samples < self.total_samples, ( + 'no samples left to consume: ' + f'{self.consumed_samples}, {self.total_samples}' + ) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert self.data_parallel_rank < data_parallel_size, ( + f'data_parallel_rank should be smaller than data size: ' + f'{self.data_parallel_rank}, {data_parallel_size}' + ) def __len__(self): return self.total_samples @@ -125,8 +138,16 @@ def __getitem__(self, idx): class MegatronPretrainingRandomSampler: - def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, data_sharding): + def __init__( + self, + dataset, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + data_sharding + ): # Keep a copy of input params for later use. 
self.dataset = dataset self.total_samples = total_samples @@ -135,19 +156,23 @@ def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, self.data_parallel_rank = data_parallel_rank self.data_parallel_size = data_parallel_size self.data_sharding = data_sharding - self.micro_batch_times_data_parallel_size = \ + self.micro_batch_times_data_parallel_size = ( self.micro_batch_size * data_parallel_size - self.last_batch_size = \ + ) + self.last_batch_size = ( self.total_samples % self.micro_batch_times_data_parallel_size + ) # Sanity checks. - assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) + assert self.total_samples > 0, ( + f'no sample to consume: {self.total_samples}' + ) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert self.data_parallel_rank < data_parallel_size, ( + f'data_parallel_rank should be smaller than data size: ' + f'{self.data_parallel_rank}, {data_parallel_size}' + ) def __len__(self): return self.total_samples @@ -163,23 +188,31 @@ def __iter__(self): # data sharding and random sampling if self.data_sharding: - bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + bucket_size = ( + self.micro_batch_size * ( + self.total_samples + // self.micro_batch_times_data_parallel_size + ) + ) bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size - g = torch.Generator() g.manual_seed(self.epoch) random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: - full_bucket_size = (self.total_samples // self.micro_batch_size) \ - * self.micro_batch_size + full_bucket_size = ( + self.micro_batch_size * ( + 
self.total_samples + // self.micro_batch_size + ) + ) full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) - idx_range_total = \ + idx_range_total = ( torch.randperm(full_bucket_size, generator=g).tolist() + ) idx_range_active = idx_range_total[full_bucket_offset:] idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index cadca053cf..61dd3909e4 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -40,6 +40,18 @@ DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +def get_datasets_corpuses_weights_and_num_samples(data_prefix, train_valid_test_num_samples): + assert len(data_prefix) % 3 == 0 + num_datasets = len(data_prefix) // 3 + data_new_prefix = [] + corpuses = [] + for i in range(num_datasets): + data_new_prefix += [data_prefix[3*i], data_prefix[3*i+1]] + corpuses.append(data_prefix[3*i+2]) + prefixes, weights, datasets_train_valid_test_num_samples = \ + get_datasets_weights_and_num_samples(data_new_prefix, + train_valid_test_num_samples) + return prefixes, corpuses, weights, datasets_train_valid_test_num_samples def get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples): diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 210a92c85e..d09f08d63a 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -9,168 +9,541 @@ import numpy as np import torch from deepspeed.accelerator import get_accelerator -from megatron import print_rank_0, is_rank_0, get_args +from megatron import is_rank_0, get_args from megatron.core import mpu +from megatron.data import helpers # type:ignore from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples +from megatron.data.dataset_utils import ( + get_datasets_weights_and_num_samples, + 
get_datasets_corpuses_weights_and_num_samples, +) from megatron.data.dataset_utils import get_train_valid_test_split_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - -def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - train_data_prefix=None, - valid_data_prefix=None, - test_data_prefix=None, - return_doc_ids=False, *, - data_cache_path=None): +from megatron.utils import PerfTrace, Profile, get_logger +from mpi4py import MPI + +dlp = Profile("DATASET") + +log = get_logger(__name__, rank_zero_only=True) + + +@dlp.log +def build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + train_data_prefix=None, + valid_data_prefix=None, + test_data_prefix=None, + return_doc_ids=False, + *, + data_cache_path=None, +): """Build train, valid, and test datasets.""" if data_prefix: - print_rank_0("Single data path provided for train, valid & test") + log.debug("Single data path provided for train, valid & test") # Single dataset. if len(data_prefix) == 1: - return _build_train_valid_test_datasets(data_prefix[0], - data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - data_cache_path=data_cache_path) + return _build_train_valid_test_datasets( + data_prefix[0], + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + data_cache_path=data_cache_path, + ) # Blending dataset. # Parse the values. 
- output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output + output = get_datasets_corpuses_weights_and_num_samples( + data_prefix, train_valid_test_num_samples + ) + prefixes, corpuses, weights, datasets_train_valid_test_num_samples = output + corpus_list = sorted(set(corpuses)) train_num_samples, valid_num_samples, test_num_samples = map( - sum, - zip(*datasets_train_valid_test_num_samples) + sum, zip(*datasets_train_valid_test_num_samples) ) - # Build individual datasets. + class DatasetBuilder: + """ + This is for building individual dataset from each dataset file + """ + + @dlp.log + def __init__( + self, + prefix, + corpus, + data_impl, + splits_string, + num_samples, + seq_length, + seed, + skip_warmup, + return_doc_ids, + data_cache_path=data_cache_path, + name="train", + ): + self.prefix = prefix + self.data_impl = data_impl + self.splits_string = splits_string + if name == "train": + self.num_samples = num_samples[0] + elif name == "valid": + self.num_samples = num_samples[1] + else: + self.num_samples = num_samples[2] + self.num_samples_train_valid_test = num_samples + self.seq_length = seq_length + self.seed = seed + self.skip_warmup = skip_warmup + self.return_doc_ids = return_doc_ids + self.data_cache_path = data_cache_path + self.dataset = None + self.name = name + self.desc = prefix + f"{self.num_samples}" + f"{seq_length}" + f"{seed}" + self.build = False + self.corpus = corpus + + @dlp.log + def Build(self): + self.dataset = _build_train_valid_test_datasets_single( + self.prefix, + self.data_impl, + self.splits_string, + self.num_samples_train_valid_test, + self.seq_length, + self.seed, + self.skip_warmup, + self.name, + self.return_doc_ids, + data_cache_path=self.data_cache_path, + ) + self.build = True + return self.dataset + + class BuildCorpusDataset(torch.utils.data.Dataset): + @dlp.log + def __init__(self, dataset_builders): + 
self.dataset_builders = dataset_builders + self.num_datasets = len(dataset_builders) + self.num_samples = np.sum([d.num_samples for d in dataset_builders]) + self.indices = np.zeros((self.num_samples, 2), dtype=np.uint64) + self.desc = "CorpusDataset:" + # m = 0 + num_samples_list = np.array([d.num_samples for d in dataset_builders]) + self.num_samples = np.sum(num_samples_list) + args = get_args() + + @dlp.log + def _build_indices_blended(): + start_time = time.time() + dataset_index = np.zeros(self.num_samples, dtype=np.int64) + dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64) + weights = num_samples_list / self.num_samples + helpers.build_blending_indices( + dataset_index, dataset_sample_index, + weights, self.num_datasets, self.num_samples, + torch.distributed.get_rank() == 0) + log.debug(f"> elapsed time for building blendable dataset indices for corpus {self.dataset_builders[0].corpus}: " + "{:.2f} (sec)".format(time.time() - start_time)) + return dataset_index, dataset_sample_index + + + def _build_indices_concat(): + start_time = time.time() + dataset_index = np.zeros(self.num_samples, dtype=np.int64) + dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64) + helpers.build_concat_indices( + dataset_index, + dataset_sample_index, + num_samples_list, + self.num_datasets, + torch.distributed.get_rank() == 0, + ) + log.debug( + "> elapsed time for building concat dataset indices: " + "{:.2f} (sec)".format(time.time() - start_time) + ) + return dataset_index, dataset_sample_index + + if args.blend_sample_in_corpus: + self.dataset_index, self.dataset_sample_index = _build_indices_blended() + else: + self.dataset_index, self.dataset_sample_index = _build_indices_concat() + + np_rng = np.random.RandomState(seed=dataset_builders[0].seed) + self.shuffle_index = np.arange(self.num_samples) + if args.shuffle_sample_in_corpus: + np_rng.shuffle(self.shuffle_index) + for i in range(self.num_datasets): + self.desc += dataset_builders[i].prefix + 
"," + + log.info( + f"[BuildConcatDataset] Caught {args.shuffle_sample_in_corpus=} across" + f" {self.num_samples} samples" + ) + self.desc += ( + f"-{self.num_samples}" + + f"-{dataset_builders[0].seq_length}" + + f"{dataset_builders[0].seed}" + ) + + def __len__(self): + return self.num_samples + + @dlp.log + def __getitem__(self, idx): + id_shuffle = self.shuffle_index[idx] + i = self.dataset_index[id_shuffle] + j = self.dataset_sample_index[id_shuffle] + if self.dataset_builders[i].build: + return self.dataset_builders[i].dataset[j] + else: + return self.dataset_builders[i].Build()[j] + + # Predetermine whether need to build the specific dataset or not. + start_time = time.time() + log.debug(" >>> Started building datasets in distributed way ... ") + + a, b, c = [int(d) for d in splits_string.split(",")] + train_datasets = [] valid_datasets = [] test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup, - return_doc_ids, - data_cache_path=data_cache_path) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) + # Build individual datasets. 
+ args = get_args() + @dlp.log + def build_corpus_datasets(dataset_type="train"): + start_time = time.time() + log.debug(f" >>> Building {dataset_type} corpus datasets ...") + datasets = [] + corpus_builders = {} + corpus_weights = {} + for c in corpus_list: + corpus_builders[c] = [] + corpus_weights[c] = 0.0 + dataset_builders = [ + DatasetBuilder( + prefixes[i], + corpuses[i], + data_impl, + splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, + skip_warmup, + return_doc_ids, + data_cache_path, + dataset_type, + ) + for i in range(len(weights)) + ] + for i in range( + torch.distributed.get_rank() + // mpu.get_tensor_model_parallel_world_size(), + len(weights), + torch.distributed.get_world_size() + // mpu.get_tensor_model_parallel_world_size(), + ): + dataset_builders[i].Build() + log.debug( + f" >>> Finished building individual datasets in {time.time() - start_time} seconds" + ) + start_concating_time = time.time() + for i, d in zip(range(len(weights)), dataset_builders): + corpus_builders[d.corpus].append(d) + corpus_weights[d.corpus] += weights[i] + total = 0 + log.debug(" > number of samples for each corpus ") + corpus_weights_achieved = {} + for c in corpus_list: + datasets.append(BuildCorpusDataset(corpus_builders[c])) + total += datasets[-1].num_samples + corpus_weights_achieved[c] = ( + float(datasets[-1].num_samples) / train_num_samples + ) + log.debug( + f" {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})" + ) + log.debug(f" > total number of samples: {total}") + log.debug( + f" >>> Finished concatenating datasets in {time.time() - start_concating_time} seconds" + ) + log.debug( + f" >>> Finished building {dataset_type} corpus datasets in {time.time() - start_time} seconds" + ) + return datasets, [corpus_weights_achieved[c] for c in corpus_list] + + train_weights = None + if a > 0: + train_datasets, train_weights = build_corpus_datasets("train") + valid_weights = None + if b > 0: 
+ valid_datasets, valid_weights = build_corpus_datasets("valid") + test_weights = None + if c > 0: + test_datasets, test_weights = build_corpus_datasets("test") + + # This barrier is critical to make sure that all the datasets are built once + # and the metadata were written to the cache folder before other ranks touch them + log.debug( + f" >>> Rank 0 - finished building datasets in {time.time() - start_time} seconds" + ) + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group()) + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + log.debug( + f" >>> Finished building datasets (all ranks) in distributed way in {time.time() - start_time} seconds" + ) + log.debug(" >>> Starting to build BlendableDataset") # Blend. + start_time = time.time() blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights, train_num_samples, - data_cache_path=data_cache_path) + if train_datasets and train_weights: + blending_train_dataset = BlendableDataset( + train_datasets, + train_weights, + train_num_samples, + data_cache_path=data_cache_path, + ) blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_num_samples, - data_cache_path=data_cache_path) + if valid_datasets and valid_weights: + blending_valid_dataset = BlendableDataset( + valid_datasets, + valid_weights, + valid_num_samples, + data_cache_path=data_cache_path, + ) blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights, test_num_samples, - data_cache_path=data_cache_path) - - return (blending_train_dataset, blending_valid_dataset, - blending_test_dataset) + if test_datasets and test_weights: + blending_test_dataset = BlendableDataset( + test_datasets, + test_weights, + test_num_samples, + data_cache_path=data_cache_path, + ) + end_time = 
time.time() + log.debug( + f" >>> Finished building BlendableDataset in {end_time - start_time} seconds" + ) + return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) else: - print_rank_0("Separate data paths provided for train, valid & test. Split string will be ignored.") + log.debug( + "Separate data paths provided for train, valid & test. Split string will be ignored." + ) train_dataset, valid_dataset, test_dataset = None, None, None # Single dataset. if train_data_prefix is not None: - train_dataset = build_dataset("train", train_data_prefix, data_impl, - splits_string, - train_valid_test_num_samples[0], - seq_length, seed, skip_warmup, - data_cache_path=data_cache_path) + train_dataset = build_dataset( + "train", + train_data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples[0], + seq_length, + seed, + skip_warmup, + data_cache_path=data_cache_path, + ) if valid_data_prefix is not None: - valid_dataset = build_dataset("valid", valid_data_prefix, data_impl, - splits_string, - train_valid_test_num_samples[1], - seq_length, seed, False, - data_cache_path=data_cache_path) - + valid_dataset = build_dataset( + "valid", + valid_data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples[1], + seq_length, + seed, + False, + data_cache_path=data_cache_path, + ) if test_data_prefix is not None: - test_dataset = build_dataset("test", test_data_prefix, data_impl, - splits_string, - train_valid_test_num_samples[2], - seq_length, seed, False, - data_cache_path=data_cache_path) + test_dataset = build_dataset( + "test", + test_data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples[2], + seq_length, + seed, + False, + data_cache_path=data_cache_path, + ) return (train_dataset, valid_dataset, test_dataset) -def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - return_doc_ids=False, *, - data_cache_path=None): 
+@dlp.log +def _build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + return_doc_ids=False, + *, + data_cache_path=None, +): """Build train, valid, and test datasets.""" # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) + indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) total_num_of_documents = indexed_dataset.sizes.shape[0] splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. - print_rank_0(' > dataset split:') + log.debug(" > dataset split:") def print_split_stats(name, index): - print_rank_0(' {}:'.format(name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], - splits[index + 1] - splits[index])) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) + log.debug(" {}:".format(name)) + log.debug( + " document indices in [{}, {}) total of {} " "documents".format( + splits[index], splits[index + 1], splits[index + 1] - splits[index] + ) + ) + + print_split_stats("train", 0) + print_split_stats("validation", 1) + print_split_stats("test", 2) def build_dataset(index, name): dataset = None if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], - step=1, dtype=np.int32) - dataset = GPTDataset(name, data_prefix, documents, indexed_dataset, - splits_string, - train_valid_test_num_samples[index], - seq_length, seed, - return_doc_ids, - data_cache_path=data_cache_path) + documents = np.arange( + start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 + ) + dataset = GPTDataset( + name, + data_prefix, + documents, + indexed_dataset, + splits_string, + train_valid_test_num_samples[index], + seq_length, + seed, + return_doc_ids, + data_cache_path=data_cache_path, + ) return 
dataset - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') + train_dataset = build_dataset(0, "train") + valid_dataset = build_dataset(1, "valid") + test_dataset = build_dataset(2, "test") return (train_dataset, valid_dataset, test_dataset) -def build_dataset(dataset_name, data_prefix, data_impl, - splits_string, num_samples, - seq_length, seed, skip_warmup, - *, - data_cache_path=None): +@dlp.log +def _build_train_valid_test_datasets_single( + data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + name, + return_doc_ids=False, + *, + data_cache_path=None, +): + """Build train, valid, and test datasets.""" + + # Each rank print out information + log.debug(f" >> building dataset for {data_prefix}") + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. 
+ log.debug(" > dataset split:") + + def print_split_stats(name, index): + log.debug(" {}:".format(name)) + log.debug( + " document indices in [{}, {}) total of {} " "documents".format( + splits[index], splits[index + 1], splits[index + 1] - splits[index] + ) + ) + + print_split_stats("train", 0) + print_split_stats("validation", 1) + print_split_stats("test", 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange( + start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 + ) + dataset = GPTDataset( + name, + data_prefix, + documents, + indexed_dataset, + splits_string, + train_valid_test_num_samples[index], + seq_length, + seed, + return_doc_ids, + data_cache_path=data_cache_path, + ) + return dataset + + if name.find("train") != -1: + return build_dataset(0, "train") + if name.find("valid") != -1: + return build_dataset(1, "valid") + if name.find("test") != -1: + return build_dataset(2, "test") + + +@dlp.log +def build_dataset( + dataset_name, + data_prefix, + data_impl, + splits_string, + num_samples, + seq_length, + seed, + skip_warmup, + *, + data_cache_path=None, +): dataset = None if len(data_prefix) == 1: - dataset = _build_dataset(dataset_name, data_prefix[0], data_impl, - splits_string, num_samples, seq_length, - seed, skip_warmup, - data_cache_path=data_cache_path) + dataset = _build_dataset( + dataset_name, + data_prefix[0], + data_impl, + splits_string, + num_samples, + seq_length, + seed, + skip_warmup, + data_cache_path=data_cache_path, + ) else: # Blending dataset. # Parse the values. @@ -181,73 +554,108 @@ def build_dataset(dataset_name, data_prefix, data_impl, # Build individual datasets. 
datasets = [] for i in range(len(prefixes)): - ds = _build_dataset(dataset_name, prefixes[i], data_impl, - splits_string, dataset_num_samples[i], - seq_length, seed, skip_warmup, - data_cache_path=data_cache_path) + ds = _build_dataset( + dataset_name, + prefixes[i], + data_impl, + splits_string, + dataset_num_samples[i], + seq_length, + seed, + skip_warmup, + data_cache_path=data_cache_path, + ) if ds: datasets.append(ds) if datasets: - dataset = BlendableDataset(datasets, weights, num_samples, - data_cache_path=data_cache_path) + dataset = BlendableDataset( + datasets, weights, num_samples, data_cache_path=data_cache_path + ) return dataset -def _build_dataset(dataset_name, data_prefix, data_impl, splits_string, - num_samples, seq_length, seed, skip_warmup, - *, - data_cache_path=None): +@dlp.log +def _build_dataset( + dataset_name, + data_prefix, + data_impl, + splits_string, + num_samples, + seq_length, + seed, + skip_warmup, + *, + data_cache_path=None, +): """ Build dataset. This method is called when individual train, valid, test datasets are provided """ # Indexed dataset. 
- indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) + indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) total_num_of_documents = indexed_dataset.sizes.shape[0] - print_rank_0(' {}:'.format(dataset_name)) - print_rank_0(' document indices in [0, {}) total of {} ' - 'documents'.format(total_num_of_documents, total_num_of_documents)) - - documents = np.arange(start=0, stop=total_num_of_documents, - step=1, dtype=np.int32) - - dataset = GPTDataset(dataset_name, data_prefix, documents, indexed_dataset, - splits_string, num_samples, seq_length, seed, - data_cache_path=data_cache_path) + log.debug(" {}:".format(dataset_name)) + log.debug( + " document indices in [0, {}) total of {} " "documents".format( + total_num_of_documents, total_num_of_documents + ) + ) + + documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32) + + dataset = GPTDataset( + dataset_name, + data_prefix, + documents, + indexed_dataset, + splits_string, + num_samples, + seq_length, + seed, + data_cache_path=data_cache_path, + ) return dataset +@dlp.log def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): """Build indexed dataset.""" - print_rank_0(' > building dataset index ...') + log.debug(" > building dataset index ...") start_time = time.time() - indexed_dataset = make_indexed_dataset(data_prefix, - data_impl, - skip_warmup) - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - print_rank_0(' number of documents: {}'.format( - indexed_dataset.sizes.shape[0])) + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + log.debug( + " > finished creating indexed dataset in {:4f} " "seconds".format( + time.time() - start_time + ) + ) + log.debug(" number of documents: {}".format(indexed_dataset.sizes.shape[0])) return indexed_dataset class GPTDataset(torch.utils.data.Dataset): - - def __init__(self, name, data_prefix, documents, 
indexed_dataset, - splits_string, num_samples, seq_length, seed, - return_doc_ids=False, *, - data_cache_path=None): - + @dlp.log + def __init__( + self, + name, + data_prefix, + documents, + indexed_dataset, + splits_string, + num_samples, + seq_length, + seed, + return_doc_ids=False, + *, + data_cache_path=None, + ): self.name = name self.indexed_dataset = indexed_dataset self.return_doc_ids = return_doc_ids @@ -257,20 +665,29 @@ def __init__(self, name, data_prefix, documents, indexed_dataset, assert np.max(documents) < indexed_dataset.sizes.shape[0] # Build index mappings. - self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = \ - _build_index_mappings(self.name, data_prefix, - documents, self.indexed_dataset.sizes, - splits_string, num_samples, seq_length, seed, - data_cache_path=data_cache_path) - + self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = ( + _build_index_mappings( + self.name, + data_prefix, + documents, + self.indexed_dataset.sizes, + splits_string, + num_samples, + seq_length, + seed, + data_cache_path=data_cache_path, + ) + ) def __len__(self): # -1 is due to data structure used to retieve the index: # sample i --> [sample_idx[i], sample_idx[i+1]) return self.sample_idx.shape[0] - 1 + @dlp.log def __getitem__(self, idx): args = get_args() + assert args is not None orig_idx = idx # Get the shuffled index. 
try: @@ -279,21 +696,24 @@ def __getitem__(self, idx): if is_rank_0(): import json from rich import print_json + print(exc) print( - '\n'.join( - ['-------------------------------------------------', - f'Trying to access {idx=} from self.shuffle_idx,', - f'but {len(self.shuffle_idx)=}', - '-------------------------------------------------'] + "\n".join( + [ + "-------------------------------------------------", + f"Trying to access {idx=} from self.shuffle_idx,", + f"but {len(self.shuffle_idx)=}", + "-------------------------------------------------", + ] ) ) print_json( json.dumps( { - 'doc_idx': len(self.doc_idx), - 'sample_idx': len(self.sample_idx), - 'shuffle_idx': len(self.shuffle_idx), + "doc_idx": len(self.doc_idx), + "sample_idx": len(self.sample_idx), + "shuffle_idx": len(self.shuffle_idx), }, indent=4, ) @@ -307,45 +727,57 @@ def __getitem__(self, idx): doc_ids = [] if doc_index_f == doc_index_l: doc_ids.append(self.doc_idx[doc_index_f]) - sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1) + sample = self.indexed_dataset.get( + self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1, + ) else: # Otherwise, get the rest of the initial document. doc_ids.append(self.doc_idx[doc_index_f]) - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f)] + sample_list = [ + self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] # Loop over all in between documents and add the entire document. for i in range(doc_index_f + 1, doc_index_l): doc_ids.append(self.doc_idx[i]) sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) # And finally add the relevant portion of last document. 
doc_ids.append(self.doc_idx[doc_index_l]) - sample_list.append(self.indexed_dataset.get( - self.doc_idx[doc_index_l], - length=offset_l + 1)) + sample_list.append( + self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + ) sample = np.concatenate(sample_list) - text_name = 'text' + text_name = "text" if args.use_dataset_only: - text_name = 'input_ids' + text_name = "input_ids" sample_dict = {text_name: np.array(sample, dtype=np.int64)} if args.return_data_index: - sample_dict.update({'index': np.array([orig_idx], dtype=np.int64)}) + sample_dict.update({"index": np.array([orig_idx], dtype=np.int64)}) - if self.return_doc_ids: # for retro preprocessing - sample_dict.update({'doc_ids': np.array(doc_ids, dtype=np.int64)}) + if self.return_doc_ids: # for retro preprocessing + sample_dict.update({"doc_ids": np.array(doc_ids, dtype=np.int64)}) if args.use_dataset_only: - sample_dict.update({'labels': np.array(sample, dtype=np.int64)}) + sample_dict.update({"labels": np.array(sample, dtype=np.int64)}) return sample_dict -def _build_index_mappings(name, data_prefix, documents, sizes, - splits_string, num_samples, seq_length, seed, - *, - data_cache_path): +@dlp.log +def _build_index_mappings( + name, + data_prefix, + documents, + sizes, + splits_string, + num_samples, + seq_length, + seed, + *, + data_cache_path, +): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. sample-idx: is the start document index and document offset for each @@ -353,10 +785,11 @@ def _build_index_mappings(name, data_prefix, documents, sizes, shuffle-idx: maps the sample index into a random index into sample-idx. """ args = get_args() + assert args is not None # Number of tokens in each epoch and number of required epochs. 
tokens_per_epoch = _num_tokens(documents, sizes) num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) - if args.train_data_exact_num_epochs is not None and name == 'train': + if args.train_data_exact_num_epochs is not None and name == "train": num_epochs = args.train_data_exact_num_epochs # rng state @@ -371,13 +804,13 @@ def _build_index_mappings(name, data_prefix, documents, sizes, desc += f"Sequence length {seq_length}\n" desc += f"Random seed {seed}\n" desc += f"Split {splits_string}\n" - desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest() + desc_hash = hashlib.md5(desc.encode("utf-8")).hexdigest() desc_filename = desc_hash + ".dsc" - doc_idx_filename = desc_hash + '_doc_idx.npy' - sample_idx_filename = desc_hash + '_sample_idx.npy' - shuffle_idx_filename = desc_hash + '_shuffle_idx.npy' + doc_idx_filename = desc_hash + "_doc_idx.npy" + sample_idx_filename = desc_hash + "_sample_idx.npy" + shuffle_idx_filename = desc_hash + "_shuffle_idx.npy" - if name == 'train': + if name == "train": # force to use certain index files if args.train_desc_path is not None: desc_filename = args.train_desc_path @@ -392,15 +825,15 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # duplication, then look in data-cache-path if specified, # If nothing is found, use the last path looked in build_indices = True - prefixes = [os.path.join(os.path.dirname(data_prefix), 'index-cache')] + prefixes = [os.path.join(os.path.dirname(data_prefix), "index-cache")] if data_cache_path is not None: prefixes.append(data_cache_path) for prefix in prefixes: idx_path = { - 'desc': os.path.join(prefix, desc_filename), - 'doc': os.path.join(prefix, doc_idx_filename), - 'sample': os.path.join(prefix, sample_idx_filename), - 'shuffle': os.path.join(prefix, shuffle_idx_filename) + "desc": os.path.join(prefix, desc_filename), + "doc": os.path.join(prefix, doc_idx_filename), + "sample": os.path.join(prefix, sample_idx_filename), + "shuffle": os.path.join(prefix, 
shuffle_idx_filename), } for f in idx_path.values(): if not os.path.isfile(f): @@ -409,13 +842,17 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # Found our files! build_indices = False break - data_cache_dir = os.path.dirname(idx_path['desc']) + data_cache_dir = os.path.dirname(idx_path["desc"]) data_cache_success = True # Build the indexed mapping if not exist. - if build_indices and is_rank_0(): - print_rank_0(' > WARNING: could not find index map files, building ' - 'the indices on rank 0 ...') + if build_indices: + # Since this function will be called by all the rank in the very beginning. Therefore, we assume that all the + # ranks will first create the document files, and then read it. + # There will not be contension effects going on either + log.warning( + f" > WARNING: could not find index map files, building on rank {torch.distributed.get_rank()}" + ) # For the last epoch, decide whether include the entire epoch # in the global shuffle or not. @@ -424,64 +861,80 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # not mean anything. if num_epochs == 1: separate_last_epoch = False - print(' > only one epoch required, setting ' - 'separate_last_epoch to False', flush=True) + log.debug( + " > only one epoch required, setting " "separate_last_epoch to False" + ) else: # Get the number of samples for the last epoch num_samples_from_epochs_minus_one = ( - (num_epochs - 1) * tokens_per_epoch - 1) // seq_length - last_epoch_num_samples = num_samples - \ - num_samples_from_epochs_minus_one - assert last_epoch_num_samples >= 0, \ - 'last epoch number of samples should be non-negative.' + (num_epochs - 1) * tokens_per_epoch - 1 + ) // seq_length + last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one + assert ( + last_epoch_num_samples >= 0 + ), "last epoch number of samples should be non-negative." 
num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length - assert last_epoch_num_samples <= (num_samples_per_epoch + 1), \ - 'last epoch number of samples exceeded max value.' + assert last_epoch_num_samples <= ( + num_samples_per_epoch + 1 + ), "last epoch number of samples exceeded max value." # If we have less than 80% of the samples for the last epoch, # seperate out the epoch and treat it differently. # Note: the 80% number is just based on common sense and can # be adjusted if needed. - separate_last_epoch = (last_epoch_num_samples < - int(0.80 * num_samples_per_epoch)) + separate_last_epoch = last_epoch_num_samples < int( + 0.80 * num_samples_per_epoch + ) if separate_last_epoch: - string = ' > last epoch number of samples ({}) is smaller '\ - 'than 80% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to True' + string = ( + " > last epoch number of samples ({}) is smaller " + "than 80% of number of samples per epoch ({}), " + "setting separate_last_epoch to True" + ) else: - string = ' > last epoch number of samples ({}) is larger '\ - 'than 80% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to False' - print(string.format(last_epoch_num_samples, - num_samples_per_epoch), flush=True) - + string = ( + " > last epoch number of samples ({}) is larger " + "than 80% of number of samples per epoch ({}), " + "setting separate_last_epoch to False" + ) + log.debug(string.format(last_epoch_num_samples, num_samples_per_epoch)) try: os.makedirs(data_cache_dir, exist_ok=True) # description - with open(idx_path['desc'], 'wt') as fd: + with open(idx_path["desc"], "wt") as fd: fd.write(desc) # doc-idx. 
start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng, - separate_last_epoch) - np.save(idx_path['doc'], doc_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save doc-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) + doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch) + np.save(idx_path["doc"], doc_idx, allow_pickle=True) + log.debug( + " > elasped time to build and save doc-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) + ) # sample-idx. start_time = time.time() # Use C++ implementation for speed. # First compile and then import. from megatron.data import helpers + assert doc_idx.dtype == np.int32 assert sizes.dtype == np.int32 - sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch) - np.save(idx_path['sample'], sample_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save sample-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) + sample_idx = helpers.build_sample_idx( + sizes, + doc_idx, + seq_length, + num_epochs, + tokens_per_epoch, + torch.distributed.get_rank() == 0, + ) + np.save(idx_path["sample"], sample_idx, allow_pickle=True) + log.debug( + " > elasped time to build and save sample-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) + ) # shuffle-idx. 
start_time = time.time() # -1 is due to data structure used to retieve the index: @@ -490,45 +943,46 @@ def _build_index_mappings(name, data_prefix, documents, sizes, num_samples_ = num_samples_from_epochs_minus_one else: num_samples_ = sample_idx.shape[0] - 1 - shuffle_idx = _build_shuffle_idx(num_samples_, - sample_idx.shape[0] - 1, np_rng) - np.save(idx_path['shuffle'], shuffle_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save shuffle-idx mapping' - ' (seconds): {:4f}'.format(time.time() - start_time)) + shuffle_idx = _build_shuffle_idx( + num_samples_, sample_idx.shape[0] - 1, np_rng + ) + np.save(idx_path["shuffle"], shuffle_idx, allow_pickle=True) + log.debug( + " > elasped time to build and save shuffle-idx mapping" + " (seconds): {:4f}".format(time.time() - start_time) + ) except OSError: - print(f'There was an error trying to create the data cache directory ({data_cache_dir})') - print('or a file in it. This defaults to a directory "index-cache" within the directory') - print('the data files are in and can be set with the --data-cache-path argument. Please') - print('ensure you have write access to this directory or specify one that you do have') - print('write access to.') + print( + f"There was an error trying to create the data cache directory ({data_cache_dir})" + ) + print( + 'or a file in it. This defaults to a directory "index-cache" within the directory' + ) + print( + "the data files are in and can be set with the --data-cache-path argument. 
Please" + ) + print( + "ensure you have write access to this directory or specify one that you do have" + ) + print("write access to.") data_cache_success = False - counts = get_accelerator().LongTensor([data_cache_success]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - if counts[0].item() != ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // - torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())): - print_rank_0("Data index creation unsuccessful, exiting.") - exit() - # Load mappings. start_time = time.time() - print_rank_0(f" > loading doc-idx mapping from {idx_path['doc']}") - doc_idx = np.load(idx_path['doc'], allow_pickle=True, mmap_mode='r') + log.debug(f" > loading doc-idx mapping from {idx_path['doc']}") + doc_idx = np.load(idx_path["doc"], allow_pickle=True, mmap_mode="r") - print_rank_0(f" > loading sample-idx mapping from {idx_path['sample']}") - sample_idx = np.load(idx_path['sample'], allow_pickle=True, mmap_mode='r') + log.debug(f" > loading sample-idx mapping from {idx_path['sample']}") + sample_idx = np.load(idx_path["sample"], allow_pickle=True, mmap_mode="r") - print_rank_0(f" > loading shuffle-idx mapping from {idx_path['shuffle']}") - shuffle_idx = np.load(idx_path['shuffle'], allow_pickle=True, mmap_mode='r') + log.debug(f" > loading shuffle-idx mapping from {idx_path['shuffle']}") + shuffle_idx = np.load(idx_path["shuffle"], allow_pickle=True, mmap_mode="r") - print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( - time.time() - start_time)) - print_rank_0(' total number of samples: {}'.format( - sample_idx.shape[0])) - print_rank_0(' total number of epochs: {}'.format(num_epochs)) + log.debug( + " loaded indexed file in {:3.3f} seconds".format(time.time() - start_time) + ) + log.debug(" total number of samples: 
{}".format(sample_idx.shape[0])) + log.debug(" total number of epochs: {}".format(num_epochs)) return doc_idx, sample_idx, shuffle_idx, desc, desc_hash @@ -553,24 +1007,25 @@ def _num_epochs(tokens_per_epoch, seq_length, num_samples): return num_epochs +@dlp.log def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): """Build an array with length = number-of-epochs * number-of-dcuments. Each index is mapped to a corresponding document.""" if not separate_last_epoch or num_epochs == 1: - doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] doc_idx[:] = documents doc_idx = doc_idx.reshape(-1) doc_idx = doc_idx.astype(np.int32) np_rng.shuffle(doc_idx) return doc_idx - doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) + doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False) doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) return np.concatenate((doc_idx_first, doc_idx_last)) -def _build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch): +@dlp.log +def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): """Sample index mapping is a 2D array with sizes [number-of-samples + 1, 2] where [..., 0] contains the index into `doc_idx` and [..., 1] is the @@ -604,7 +1059,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, # Note that -1 here is for the same reason we have -1 in # `_num_epochs` calculations. if remaining_seq_length <= 0: - doc_offset += (remaining_seq_length + doc_length - 1) + doc_offset += remaining_seq_length + doc_length - 1 remaining_seq_length = 0 else: # Otherwise, start from the begining of the next document. 
@@ -618,23 +1073,27 @@ def _build_sample_idx(sizes, doc_idx, seq_length, return sample_idx +@dlp.log def _build_shuffle_idx(num_samples, total_size, np_rng): """Build the range [0, size) and shuffle.""" - print(' > building shuffle index with split [0, {}) and [{}, {}) ' - '...'.format(num_samples, num_samples, total_size), flush=True) + log.debug( + " > building shuffle index with split [0, {}) and [{}, {}) " "...".format( + num_samples, num_samples, total_size + ) + ) dtype_ = np.uint32 if total_size >= (np.iinfo(np.uint32).max - 1): dtype_ = np.int64 - shuffle_idx_first = np.arange(start=0, stop=num_samples, - step=1, dtype=dtype_) + shuffle_idx_first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_) np_rng.shuffle(shuffle_idx_first) if num_samples == total_size: return shuffle_idx_first - shuffle_idx_last = np.arange(start=num_samples, stop=total_size, - step=1, dtype=dtype_) + shuffle_idx_last = np.arange( + start=num_samples, stop=total_size, step=1, dtype=dtype_ + ) np_rng.shuffle(shuffle_idx_last) return np.concatenate((shuffle_idx_first, shuffle_idx_last)) diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 142f159dd3..9dee0589b6 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -15,7 +15,23 @@ namespace py = pybind11; using namespace std; const int32_t LONG_SENTENCE_LEN = 512; - +void build_concat_indices(py::array_t& dataset_index, py::array_t& dataset_sample_index, + const py::array_t &num_samples, + const int64_t num_datasets, const bool verbose) { + if (verbose) { + std::cout << "> building indices for corpus datasets ..." 
<< std::endl; + } + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto num_samples_ptr = num_samples.unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + int64_t m = 0; + for(uint64_t i=0; i& dataset_index, py::array_t& dataset_sample_index, @@ -84,7 +100,7 @@ py::array build_sample_idx(const py::array_t& sizes_, const py::array_t& doc_idx_, const int32_t seq_length, const int32_t num_epochs, - const int64_t tokens_per_epoch) { + const int64_t tokens_per_epoch, const bool verbose=false) { /* Sample index (sample_idx) is used for gpt2 like dataset for which the documents are flattened and the samples are built based on this 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] @@ -103,16 +119,17 @@ py::array build_sample_idx(const py::array_t& sizes_, // Mapping and it's length (1D). int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; int64_t* sample_idx = new int64_t[2*(num_samples+1)]; - - cout << " using:" << endl << std::flush; - cout << " number of documents: " << - doc_idx_.shape(0) / num_epochs << endl << std::flush; - cout << " number of epochs: " << num_epochs << - endl << std::flush; - cout << " sequence length: " << seq_length << - endl << std::flush; - cout << " total number of samples: " << num_samples << - endl << std::flush; + if (verbose) { + cout << " using:" << endl << std::flush; + cout << " number of documents: " << + doc_idx_.shape(0) / num_epochs << endl << std::flush; + cout << " number of epochs: " << num_epochs << + endl << std::flush; + cout << " sequence length: " << seq_length << + endl << std::flush; + cout << " total number of samples: " << num_samples << + endl << std::flush; + } // Index into sample_idx. 
int64_t sample_index = 0; @@ -698,4 +715,5 @@ PYBIND11_MODULE(helpers, m) { m.def("build_blocks_mapping", &build_blocks_mapping); m.def("build_sample_idx", &build_sample_idx); m.def("build_blending_indices", &build_blending_indices); + m.def("build_concat_indices", &build_concat_indices); } diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 62ebdc9813..e2a0c4751f 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -15,13 +15,20 @@ from functools import lru_cache import os + import shutil import struct from itertools import accumulate import numpy as np import torch -from megatron import print_rank_0 + +# from megatron import print_rank_0 +from megatron.utils import Profile, get_logger + +log = get_logger(__name__) + +dlp = Profile("DATASET") def __best_fitting_dtype(vocab_size=None): @@ -32,28 +39,32 @@ def __best_fitting_dtype(vocab_size=None): def get_available_dataset_impl(): - return ['lazy', 'cached', 'mmap'] + return ["lazy", "cached", "mmap"] def infer_dataset_impl(path): if IndexedDataset.exists(path): - with open(index_file_path(path), 'rb') as f: + with open(index_file_path(path), "rb") as f: magic = f.read(8) if magic == IndexedDataset._HDR_MAGIC: - return 'cached' + return "cached" elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: - return 'mmap' + return "mmap" else: return None else: print(f"Dataset does not exist: {path}") - print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + print( + "Path should be a basename that both .idx and .bin can be appended to get full filenames." 
+ ) return None def make_builder(out_file, impl, vocab_size=None): - if impl == 'mmap': - return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) + if impl == "mmap": + return MMapIndexedDatasetBuilder( + out_file, dtype=__best_fitting_dtype(vocab_size) + ) else: return IndexedDatasetBuilder(out_file) @@ -61,22 +72,24 @@ def make_builder(out_file, impl, vocab_size=None): def make_dataset(path, impl, skip_warmup=False): if not IndexedDataset.exists(path): print(f"Dataset does not exist: {path}") - print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + print( + "Path should be a basename that both .idx and .bin can be appended to get full filenames." + ) return None - if impl == 'infer': + if impl == "infer": impl = infer_dataset_impl(path) - if impl == 'lazy' and IndexedDataset.exists(path): + if impl == "lazy" and IndexedDataset.exists(path): return IndexedDataset(path) - elif impl == 'cached' and IndexedDataset.exists(path): + elif impl == "cached" and IndexedDataset.exists(path): return IndexedCachedDataset(path) - elif impl == 'mmap' and MMapIndexedDataset.exists(path): + elif impl == "mmap" and MMapIndexedDataset.exists(path): return MMapIndexedDataset(path, skip_warmup) print(f"Unknown dataset implementation: {impl}") return None def dataset_exists(path, impl): - if impl == 'mmap': + if impl == "mmap": return MMapIndexedDataset.exists(path) else: return IndexedDataset.exists(path) @@ -112,11 +125,11 @@ def code(dtype): def index_file_path(prefix_path): - return prefix_path + '.idx' + return prefix_path + ".idx" def data_file_path(prefix_path): - return prefix_path + '.bin' + return prefix_path + ".bin" def create_doc_idx(sizes): @@ -129,7 +142,8 @@ def create_doc_idx(sizes): class IndexedDataset(torch.utils.data.Dataset): """Loader for IndexedDataset""" - _HDR_MAGIC = b'TNTIDX\x00\x00' + + _HDR_MAGIC = b"TNTIDX\x00\x00" def __init__(self, path): super().__init__() @@ -137,43 +151,46 @@ def 
__init__(self, path): self.data_file = None self.read_index(path) + @dlp.log def read_index(self, path): - with open(index_file_path(path), 'rb') as f: + with open(index_file_path(path), "rb") as f: magic = f.read(8) assert magic == self._HDR_MAGIC, ( - 'Index file doesn\'t match expected format. ' - 'Make sure that --dataset-impl is configured properly.' + "Index file doesn't match expected format. " + "Make sure that --dataset-impl is configured properly." ) version = f.read(8) - assert struct.unpack('= self._len: - raise IndexError('index out of range') + raise IndexError("index out of range") def __del__(self): if self.data_file: self.data_file.close() # @lru_cache(maxsize=8) + @dlp.log def __getitem__(self, idx): if not self.data_file: self.read_data(self.path) if isinstance(idx, int): i = idx self.check_index(i) - tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] a = np.empty(tensor_size, dtype=self.dtype) self.data_file.seek(self.data_offsets[i] * self.element_size) self.data_file.readinto(a) @@ -182,7 +199,7 @@ def __getitem__(self, idx): start, stop, step = idx.indices(len(self)) if step != 1: raise ValueError("Slices into indexed_dataset must be contiguous") - sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] + sizes = self.sizes[self.dim_offsets[start] : self.dim_offsets[stop]] size = sum(sizes) a = np.empty(size, dtype=self.dtype) self.data_file.seek(self.data_offsets[start] * self.element_size) @@ -202,8 +219,8 @@ def size(self, index): @staticmethod def exists(path): - return ( - os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + return os.path.exists(index_file_path(path)) and os.path.exists( + data_file_path(path) ) @property @@ -212,7 +229,6 @@ def supports_prefetch(self): class IndexedCachedDataset(IndexedDataset): - def __init__(self, path): super().__init__(path) self.cache = None @@ -222,6 +238,7 @@ def 
__init__(self, path): def supports_prefetch(self): return True + @dlp.log def prefetch(self, indices): if all(i in self.cache_index for i in indices): return @@ -237,7 +254,7 @@ def prefetch(self, indices): for i in indices: self.cache_index[i] = ptx size = self.data_offsets[i + 1] - self.data_offsets[i] - a = self.cache[ptx: ptx + size] + a = self.cache[ptx : ptx + size] self.data_file.seek(self.data_offsets[i] * self.element_size) self.data_file.readinto(a) ptx += size @@ -247,14 +264,15 @@ def prefetch(self, indices): self.data_file = None # @lru_cache(maxsize=8) + @dlp.log def __getitem__(self, idx): if isinstance(idx, int): i = idx self.check_index(i) - tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] a = np.empty(tensor_size, dtype=self.dtype) ptx = self.cache_index[i] - np.copyto(a, self.cache[ptx: ptx + a.size]) + np.copyto(a, self.cache[ptx : ptx + a.size]) return a elif isinstance(idx, slice): # Hack just to make this work, can optimizer later if necessary @@ -275,8 +293,9 @@ class IndexedDatasetBuilder(object): np.float64: 8, } + @dlp.log def __init__(self, out_file, dtype=np.int32): - self.out_file = open(out_file, 'wb') + self.out_file = open(out_file, "wb") self.dtype = dtype self.data_offsets = [0] self.dim_offsets = [0] @@ -284,6 +303,7 @@ def __init__(self, out_file, dtype=np.int32): self.element_size = self.element_sizes[self.dtype] self.doc_idx = [0] + @dlp.log def add_item(self, tensor): bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) @@ -294,6 +314,7 @@ def add_item(self, tensor): def end_document(self): self.doc_idx.append(len(self.sizes)) + @dlp.log def merge_file_(self, another_file): index = IndexedDataset(another_file) assert index.dtype == self.dtype @@ -311,7 +332,7 @@ def merge_file_(self, another_file): self.doc_idx.extend((doc_offset + 
index.doc_idx)[1:]) - with open(data_file_path(another_file), 'rb') as f: + with open(data_file_path(another_file), "rb") as f: while True: data = f.read(1024) if data: @@ -321,12 +342,12 @@ def merge_file_(self, another_file): def finalize(self, index_file): self.out_file.close() - index = open(index_file, 'wb') - index.write(b'TNTIDX\x00\x00') - index.write(struct.pack(' setting tensorboard ...') _GLOBAL_TENSORBOARD_WRITER = SummaryWriter( log_dir=args.tensorboard_dir, @@ -152,6 +159,44 @@ def _set_tensorboard_writer(args): 'no TensorBoard logs will be written.', flush=True) +def _set_wandb_writer(args): + """Set wandb writer.""" + global _GLOBAL_WANDB_WRITER + _ensure_var_is_not_initialized(_GLOBAL_WANDB_WRITER, + 'wandb writer') + + if args.rank == (args.world_size - 1): + if getattr(args, 'wandb_project', '') == '' and \ + getattr(args, 'wandb_exp_name', '') == '': + print('WARNING: WANDB writing requested but no legit wandb ' + 'project or experiment name provided, ' + 'therefore no WANDB logs will be written ' + 'according to random generated project or experiment name.', flush=True) + return + + try: + import wandb + except (ImportError, ModuleNotFoundError): + print('WARNING: WANDB writing requested but is not ' + 'available (try to pip install wandb to solve it), ' + 'no WANDB logs will be written.', flush=True) + return + + if args.wandb_save_dir: + save_dir = args.wandb_save_dir + else: + # Defaults to the save dir. 
+ save_dir = os.path.join(args.save, 'wandb') + wandb_kwargs = { + 'dir': save_dir, + 'name': args.wandb_exp_name, + 'project': args.wandb_project, + 'config': vars(args)} + os.makedirs(wandb_kwargs['dir'], exist_ok=True) + wandb.init(**wandb_kwargs) + _GLOBAL_WANDB_WRITER = wandb + + def _set_adlr_autoresume(args): """Initialize ADLR autoresume.""" global _GLOBAL_ADLR_AUTORESUME @@ -179,9 +224,9 @@ def _set_timers(args): def _ensure_var_is_initialized(var, name): """Make sure the input variable is not None.""" - assert var is not None, '{} is not initialized.'.format(name) + assert var is not None, f'{name} is not initialized.' def _ensure_var_is_not_initialized(var, name): """Make sure the input variable is not None.""" - assert var is None, '{} is already initialized.'.format(name) + assert var is None, f'{name} is already initialized.' diff --git a/megatron/initialize.py b/megatron/initialize.py index 31f26c5086..90acf496ee 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
"""Megatron initialization.""" @@ -15,6 +16,8 @@ from megatron import get_args from megatron import get_tensorboard_writer from megatron.core import mpu, tensor_parallel +from megatron.core.pipeline_parallel.deepspeed_zbh1_engine import _exec_backward_only_pass, _exec_weight_pass +from megatron.core.pipeline_parallel.deepspeed_zbh1_schedule import BackwardOnlyPass, WeightPass, ZeroBubbleH1Pipeline from megatron.arguments import (parse_args, validate_args) from megatron.checkpointing import load_args_from_checkpoint from megatron.global_vars import set_global_variables @@ -182,6 +185,7 @@ def setup_deepspeed_random_and_activation_checkpointing(args): deepspeed.checkpointing.configure( mpu, + deepspeed_config=args.deepspeed_config, partition_activations=args.partition_activations, contiguous_checkpointing=args.contigious_checkpointing, num_checkpoints=num_layers, @@ -216,13 +220,21 @@ def _initialize_distributed(): get_accelerator().set_device(device) # only do so when device_count > 0 + if args.enable_zbh1_pipeline: + deepspeed.runtime.pipe.schedule.TrainSchedule = ZeroBubbleH1Pipeline + deepspeed.runtime.pipe.engine.PipelineEngine._INSTRUCTION_MAP.update( + { + BackwardOnlyPass: _exec_backward_only_pass, + WeightPass: _exec_weight_pass, + } + ) # Call the init process if args.deepspeed or args.ds_inference: deepspeed.init_distributed() else: if not torch.distributed.is_initialized(): torch.distributed.init_process_group( - backend=args.distributed_backend, + backend=get_accelerator().communication_backend_name(), world_size=args.world_size, rank=args.rank, timeout=timedelta(minutes=args.distributed_timeout_minutes)) diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 2306749fcb..f2beea06e8 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -1,16 +1,36 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# type:ignore +# noqa: E401,E402,F401 +import torch from deepspeed.accelerator.real_accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': + +accelerator = get_accelerator() + +if accelerator is not None and accelerator.device_name() == "xpu": + import intel_extension_for_pytorch # noqa: F401 # type: ignore + +if accelerator is not None and accelerator.device_name() == "cuda": from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm - from apex.normalization import MixedFusedRMSNorm as RMSNorm + + try: + from apex.normalization import MixedFusedRMSNorm as RMSNorm # type:ignore + + HAS_APEX = True + except Exception: + HAS_APEX = False + from .rmsnorm import RMSNorm else: - from .rmsnorm import RMSNorm - from torch.nn import LayerNorm - -from .distributed import DistributedDataParallel -from .bert_model import BertModel -from .gpt_model import GPTModel, GPTModelPipe -from .t5_model import T5Model -from .language_model import get_language_model -from .module import Float16Module + if hasattr(torch.xpu, "IpexRmsNorm"): + from .fused_rmsnorm import RMSNorm + else: + from .rmsnorm import RMSNorm # noqa:E401,E402,F401 + from torch.nn import LayerNorm # noqa:E401,E402,F401 + + +from .distributed import DistributedDataParallel # noqa:E401,E402,F401 +from .bert_model import BertModel # noqa:E401,E402,F401 +from .gpt_model import GPTModel, GPTModelPipe # noqa:E401,E402,F401 +from .t5_model import T5Model # noqa:E401,E402,F401 +from .language_model import get_language_model # noqa:E401,E402,F401 +from .module import Float16Module # noqa:E401,E402,F401 diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 2f3b89014b..d1ef034397 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -1,9 +1,11 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
"""This code is copied fron NVIDIA apex: https://github.com/NVIDIA/apex with some changes. """ +from deepspeed.accelerator.real_accelerator import get_accelerator import numbers import torch from torch.nn.parameter import Parameter @@ -13,6 +15,7 @@ import inspect from megatron.core.utils import make_viewless_tensor +from megatron import get_args try: from apex.contrib.layer_norm.layer_norm import FastLayerNormFN @@ -56,8 +59,15 @@ def __init__(self, normalized_shape, eps=1e-5, normalized_shape = (normalized_shape,) self.normalized_shape = torch.Size(normalized_shape) self.eps = eps - self.weight = Parameter(torch.Tensor(*normalized_shape)) - self.bias = Parameter(torch.Tensor(*normalized_shape)) + init_device = None + if get_accelerator().device_name() == 'hpu': + init_device = get_accelerator().current_device_name() + self.weight = Parameter(torch.empty(*normalized_shape, + device=init_device, + dtype=get_args().params_dtype)) + self.bias = Parameter(torch.empty(*normalized_shape, + device=init_device, + dtype=get_args().params_dtype)) self.reset_parameters() self.no_persist_layer_norm = no_persist_layer_norm self.sequence_parallel = sequence_parallel diff --git a/megatron/model/fused_rmsnorm.py b/megatron/model/fused_rmsnorm.py new file mode 100644 index 0000000000..d69b0822a4 --- /dev/null +++ b/megatron/model/fused_rmsnorm.py @@ -0,0 +1,24 @@ +from megatron import get_args + +import torch +from torch.nn.parameter import Parameter +from torch.nn import init +import intel_extension_for_pytorch as ipex # noqa + + +# Taken from facebookresearch/llama +class RMSNorm(torch.nn.Module): + def __init__( + self, dim: int, eps: float = 1e-6, sequence_parallel: bool = False + ): + super().__init__() + self.eps = eps + self.weight = Parameter(torch.ones(dim, dtype=get_args().params_dtype)) + self.sequence_parallel = sequence_parallel + setattr(self.weight, "sequence_parallel", self.sequence_parallel) + + def forward(self, x): + output = torch.xpu.IpexRmsNorm( + x, 
self.weight.shape, self.weight, self.eps + ) + return output diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index 0527765f16..e5e60c43ee 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -1,8 +1,10 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. """GPT-2 model.""" import torch +from collections import OrderedDict from megatron import get_args from megatron.core import mpu, tensor_parallel, sequence_parallel @@ -16,7 +18,7 @@ from megatron.model import LayerNorm, RMSNorm from .language_model import EmbeddingPipe -from .transformer import ParallelTransformerLayerPipe, LMHeadPipe +from .transformer import ParallelTransformerLayerPipe, LMHeadPipe, get_num_experts_per_layer from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec @@ -360,20 +362,44 @@ def _to_float16(inputs): embedding_weights_in_fp32=args.embedding_weights_in_fp32, tied_weight_attr='word_embeddings_weight')) + experts_per_layer = get_num_experts_per_layer(args.num_experts, args.num_layers, args.expert_interval) + self.is_moe_model = any(n_experts > 1 for n_experts in experts_per_layer) + + # Currently PipelineEngine does not support more than 1 pipe and/or grad partitioned tensors that + # require grads. + # When using MoE, we have 2 tensors that are passed along pipeline stages and both require grads. 
+ # Therefore, verify that both pipe_partitioned / grad_partitioned are not enabled + if self.is_moe_model and args.pipeline_model_parallel_size > 1 and args.tensor_model_parallel_size > 1: + pipe_partitioned_enabled = args.deepspeed_config_dict.get('pipeline', {}).get('pipe_partitioned', False) + grad_partitioned_enabled = args.deepspeed_config_dict.get('pipeline', {}).get('grad_partitioned', False) + assert not pipe_partitioned_enabled and not grad_partitioned_enabled, \ + 'Pipe and/or Grad partitioning are not supported for MoE model' + for layer_idx in range(args.num_layers): self.specs.append( LayerSpec(ParallelTransformerLayerPipe, - config, - layer_number=layer_idx, - self_attn_mask_type=AttnMaskType.causal)) + config, + layer_number=layer_idx, + self_attn_mask_type=AttnMaskType.causal, + num_experts=experts_per_layer[layer_idx], + input_aggregated_moe_loss=(self.is_moe_model and layer_idx > 0), + return_aggregated_moe_loss=self.is_moe_model)) + + # if model has experts, add a layer to get and cache the aggregated moe loss from the + # last transformer layer + if self.is_moe_model: + self.specs.append(self._calculate_moe_loss) # Final layernorm after transformer layers if args.normalization == 'layernorm': self.specs.append(LayerSpec(LayerNorm, args.hidden_size, - eps=args.layernorm_epsilon)) + eps=args.layernorm_epsilon, + sequence_parallel=args.sequence_parallel)) else: - self.specs.append(LayerSpec(RMSNorm, args.hidden_size, args.layernorm_epsilon)) + self.specs.append(LayerSpec(RMSNorm, args.hidden_size, + args.layernorm_epsilon, + sequence_parallel=args.sequence_parallel)) def _logits_helper(embedding, lm_output): """A wrapper to massage inputs/outputs from pipeline. 
""" @@ -404,6 +430,11 @@ def _logits_helper(embedding, lm_output): if args.fp16 or args.bf16: self.specs.append(float16_to_fp32) + # Cache losses + self.moe_loss = None + self.last_lm_loss = None # detached, for display only + self.last_moe_loss = None # detached, for display only + if args.checkpoint_activations: interval = args.checkpoint_num_layers elif args.recompute_granularity == "full" and args.recompute_method == 'uniform': @@ -418,10 +449,34 @@ def _logits_helper(embedding, lm_output): num_dp=mpu.get_data_parallel_world_size()) super().__init__(layers=self.specs, - loss_fn=CrossEntropy, + loss_fn=self.loss_func, topology=topo, activation_checkpoint_interval=interval, partition_method='type:transformer') + def _calculate_moe_loss(self, inputs): + """ Calculate MoE auxiliary loss """ + assert isinstance(inputs, tuple) and len(inputs) == 2 + hidden, aggregated_moe_loss = inputs[0], inputs[1] + args = get_args() + self.moe_loss = aggregated_moe_loss * args.moe_loss_coeff + return hidden + + def loss_func(self, output, labels): + loss = CrossEntropy(output, labels) + self.last_lm_loss = loss.clone().detach() + if self.moe_loss is not None: + loss += self.moe_loss + self.last_moe_loss = self.moe_loss.clone().detach() + return loss + def universal_checkpoint_info(self): return UniversalCheckpointInfo(using_model_pipe=True).get() + + def get_additional_losses(self): + if not self.is_moe_model: + return None + return OrderedDict({ + 'lm loss': self.last_lm_loss, + 'moe loss': self.last_moe_loss + }) diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index ec2ae1877a..eebf8744ca 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. """Transformer based language model.""" @@ -256,8 +257,8 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # Dropout. 
if self.sequence_parallel: - # already partition sequence, do not need scatter_to_sequence_parallel_region - # embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) + # already partition sequence, do not need scatter_to_sequence_parallel_region ? + embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) with tensor_parallel.get_cuda_rng_tracker().fork(): embeddings = self.embedding_dropout(embeddings) else: @@ -389,10 +390,16 @@ def __init__(self, post_process=True, num_experts=[1]): args = get_args() - # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5. - if args.untie_embeddings_and_output_weights: assert not add_decoder - super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) - + # TODO: passing `share_embeddings_and_output_weights=False` + # will not work correctly for T5 and embeddings will not be synced. + # Fix later for T5. + if args.untie_embeddings_and_output_weights: + assert not add_decoder + super(TransformerLanguageModel, self).__init__( + share_embeddings_and_output_weights=( + not args.untie_embeddings_and_output_weights + ) + ) self.pre_process = pre_process self.post_process = post_process self.hidden_size = config.hidden_size @@ -405,27 +412,35 @@ def __init__(self, self.add_pooler = add_pooler self.encoder_hidden_state = None self.add_retriever = args.retro_add_retriever - self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + self.untie_embeddings_and_output_weights = ( + args.untie_embeddings_and_output_weights + ) self.num_experts = num_experts # Embeddings. 
if self.pre_process: - self.embedding = Embedding(self.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - config, - self.num_tokentypes, - args.embedding_weights_in_fp32) + self.embedding = Embedding( + self.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + self.num_tokentypes, + args.embedding_weights_in_fp32 + ) self._embedding_key = 'embedding' # Rotary positional embeddings - self.use_rotary_position_embeddings = \ - args.use_rotary_position_embeddings + self.use_rotary_position_embeddings = ( + args.use_rotary_position_embeddings + ) if args.use_rotary_position_embeddings: self.seq_length = args.seq_length - rotary_dim = args.hidden_size // args.num_attention_heads \ - if args.kv_channels is None else args.kv_channels + rotary_dim = ( + args.hidden_size // args.num_attention_heads + if args.kv_channels is None + else args.kv_channels + ) if args.rotary_percent < 1.0: rotary_dim = int(rotary_dim * args.rotary_percent) @@ -433,15 +448,22 @@ def __init__(self, # partial rotary embeddings, which is better than full rotary # Wang and Komatsuzaki et al # https://github.com/kingoflolz/mesh-transformer-jax/ - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, theta=args.rope_theta) + self.rotary_pos_emb = RotaryEmbedding( + rotary_dim, + theta=args.rope_theta + ) # Encoder (usually set to True, False if part of an encoder-decoder # architecture and in encoder-only stage). 
if self.add_encoder: self.encoder = ParallelTransformer( config, - model_type=args.model_type if not args.retro_add_retriever \ - else ModelType.retro_decoder, + # args.model_type if not args.retro_add_retriever + # else ModelType.retro_decoder + model_type=( + ModelType.retro_decoder if args.retro_add_retriever + else args.model_type + ), self_attn_mask_type=self.encoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, @@ -461,7 +483,8 @@ def __init__(self, self_attn_mask_type=self.decoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, - num_experts=self.num_experts) + num_experts=self.num_experts + ) self._decoder_key = 'decoder' else: self.decoder = None @@ -478,24 +501,30 @@ def __init__(self, args.padded_vocab_size, config=config, init_method=self.init_method, - bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias. + # Setting bias to False always to keep it consistent with + # embedding tying that also does not have a bias. 
+ bias=False + ) self._output_layer_key = 'output_layer' def set_input_tensor(self, input_tensor): """ See megatron.model.transformer.set_input_tensor()""" - # This is usually handled in schedules.py but some inference code still # gives us non-lists or None if not isinstance(input_tensor, list): input_tensor = [input_tensor] if self.add_encoder and self.add_decoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with both encoder and decoder' + assert len(input_tensor) == 1, ( + 'input_tensor should only be length 1 ' + 'for stage with both encoder and decoder' + ) self.encoder.set_input_tensor(input_tensor[0]) elif self.add_encoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with only encoder' + assert len(input_tensor) == 1, ( + 'input_tensor should only be length 1 ' + 'for stage with only encoder' + ) self.encoder.set_input_tensor(input_tensor[0]) elif self.add_decoder: if len(input_tensor) == 2: @@ -505,32 +534,50 @@ def set_input_tensor(self, input_tensor): self.decoder.set_input_tensor(None) self.encoder_hidden_state = input_tensor[0] else: - raise Exception('input_tensor must have either length 1 or 2') + raise Exception( + 'input_tensor must have either length 1 or 2' + ) else: - raise Exception('Stage must have at least either encoder or decoder') - - def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, - dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, - retriever_input_ids=None, - retriever_position_ids=None, - retriever_attn_mask=None, - enc_dec_attn_mask=None, tokentype_ids=None, - inference_params=None, - pooling_sequence_index=0, - enc_hidden_states=None, output_enc_hidden=False): + raise Exception( + 'Stage must have at least either encoder or decoder' + ) + + def forward( + self, + enc_input_ids, + enc_position_ids, + enc_attn_mask, + dec_input_ids=None, + dec_position_ids=None, + dec_attn_mask=None, + retriever_input_ids=None, + 
retriever_position_ids=None, + retriever_attn_mask=None, + enc_dec_attn_mask=None, + tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, + output_enc_hidden=False + ): args = get_args() # Encoder embedding. if self.pre_process: - encoder_input = self.embedding(enc_input_ids, enc_position_ids, - tokentype_ids=tokentype_ids) + encoder_input = self.embedding( + enc_input_ids, + enc_position_ids, + tokentype_ids=tokentype_ids + ) else: encoder_input = None # Retriever embedding. if self.add_retriever and self.pre_process: - retriever_input = self.embedding(retriever_input_ids, - retriever_position_ids, - tokentype_ids=tokentype_ids) + retriever_input = self.embedding( + retriever_input_ids, + retriever_position_ids, + tokentype_ids=tokentype_ids + ) else: retriever_input = None diff --git a/megatron/model/module.py b/megatron/model/module.py index 963ad2d29d..08cf7cb553 100644 --- a/megatron/model/module.py +++ b/megatron/model/module.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
"""Megatron Module""" @@ -10,10 +11,9 @@ from megatron.core import mpu, tensor_parallel -_FLOAT_TYPES = [get_accelerator().FloatTensor(0).dtype] -_HALF_TYPES = [get_accelerator().HalfTensor(0).dtype] -_BF16_TYPES = [get_accelerator().BFloat16Tensor(0).dtype] - +_FLOAT_TYPES = None +_HALF_TYPES = None +_BF16_TYPES = None def param_is_not_shared(param): @@ -131,6 +131,9 @@ def conversion_helper(val, conversion): def fp32_to_float16(val, float16_convertor): """Convert fp32 `val` to fp16/bf16""" + global _FLOAT_TYPES + if _FLOAT_TYPES is None: + _FLOAT_TYPES = [get_accelerator().FloatTensor(0).dtype] def half_conversion(val): val_typecheck = val if isinstance(val_typecheck, (Parameter, Variable)): @@ -143,6 +146,11 @@ def half_conversion(val): def float16_to_fp32(val): """Convert fp16/bf16 `val` to fp32""" + global _HALF_TYPES, _BF16_TYPES + if _HALF_TYPES is None: + _HALF_TYPES = [get_accelerator().HalfTensor(0).dtype] + if _BF16_TYPES is None: + _BF16_TYPES = [get_accelerator().BFloat16Tensor(0).dtype] def float_conversion(val): val_typecheck = val if isinstance(val_typecheck, (Parameter, Variable)): diff --git a/megatron/model/rmsnorm.py b/megatron/model/rmsnorm.py index 60e8978171..68b792c83d 100644 --- a/megatron/model/rmsnorm.py +++ b/megatron/model/rmsnorm.py @@ -1,16 +1,33 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
+ +from deepspeed.accelerator import get_accelerator +from megatron import get_args + import torch +from torch.nn import init from torch.nn.parameter import Parameter + # Taken from facebookresearch/llama class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): + def __init__( + self, dim: int, eps: float = 1e-6, sequence_parallel: bool = False + ): super().__init__() self.eps = eps - self.weight = Parameter(torch.ones(dim)) + init_device = None + if get_accelerator().device_name() == "hpu": + init_device = get_accelerator().current_device_name() + self.weight = Parameter( + torch.empty(dim, device=init_device, dtype=get_args().params_dtype) + ) + init.ones_(self.weight) + self.sequence_parallel = sequence_parallel + setattr(self.weight, "sequence_parallel", self.sequence_parallel) def _norm(self, x): return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) def forward(self, x): output = self._norm(x.float()).type_as(x) - return output * self.weight \ No newline at end of file + return output * self.weight diff --git a/megatron/model/rotary_pos_embedding.py b/megatron/model/rotary_pos_embedding.py index 4d4497e0cd..0a7acb9efc 100644 --- a/megatron/model/rotary_pos_embedding.py +++ b/megatron/model/rotary_pos_embedding.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
# coding=utf-8 # The following code has been taken from https://github.com/NVIDIA/NeMo/blob/ \ @@ -11,6 +12,10 @@ __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] +# sin, cos tensors cached for all devices +cos_cached = None +sin_cached = None + class RotaryEmbedding(nn.Module): def __init__(self, dim, theta=10000): super().__init__() @@ -47,10 +52,19 @@ def apply_rotary_pos_emb(t, freqs): check https://kexue.fm/archives/8265 for detailed formulas """ rot_dim = freqs.shape[-1] - # ideally t_pass is empty so rotary pos embedding is applied to all tensor t - t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + t_pass = None + if t.shape[-1] != rot_dim: + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + global cos_cached, sin_cached + if cos_cached is None or sin_cached is None or t.shape[0] != cos_cached.shape[0]: + freqs_ = freqs[:t.shape[0]] + cos_cached = freqs_.cos().to(t.dtype) + sin_cached = freqs_.sin().to(t.dtype) # first part is cosine component # second part is sine component, need to change signs with _rotate_half method - t = (t * freqs.cos().to(t.dtype)) + (_rotate_half(t) * freqs.sin().to(t.dtype)) - return t if t_pass.shape[-1] == 0 else torch.cat((t, t_pass), dim=-1) + t = (t * cos_cached) + (_rotate_half(t) * sin_cached) + if t_pass is None: + return t + return torch.cat((t, t_pass), dim=-1) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 924f46c688..592ff2855b 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
"""Transformer.""" @@ -24,6 +25,7 @@ try: from deepspeed.sequence.layer import DistributedAttention + dist_attn_supported = True except ImportError: dist_attn_supported = False @@ -36,9 +38,12 @@ try: # FlashAttention (1.x) from flash_attn.flash_attn_interface import flash_attn_unpadded_func - from flash_attn.flash_attn_triton import flash_attn_func except ImportError: flash_attn_unpadded_func = None + +try: + from flash_attn.flash_attn_triton import flash_attn_func +except ImportError: flash_attn_func = None try: @@ -66,28 +71,31 @@ hyperparameters: transformer hyperparameters """ + class DropPath(MegatronModule): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ - def __init__(self, drop_prob=0.): + def __init__(self, drop_prob=0.0): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, hidden_state): - if self.drop_prob == 0. or not self.training: + if self.drop_prob == 0.0 or not self.training: return hidden_state keep_prob = 1 - self.drop_prob # work with diff dim tensors, not just 2D ConvNets # hidden_state: [s, b, h] shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2) - random_tensor = keep_prob + \ - torch.rand(shape, dtype=hidden_state.dtype, device=hidden_state.device) + random_tensor = keep_prob + torch.rand( + shape, dtype=hidden_state.dtype, device=hidden_state.device + ) random_tensor.floor_() # binarize output = hidden_state.div(keep_prob) * random_tensor return output + class ParallelMLP(MegatronModule): """MLP. 
@@ -116,7 +124,7 @@ def __init__(self, config, moe=False, enable_expert_tensor_parallelism=False): gather_output=False, skip_bias_add=True, moe=moe, - enable_expert_tensor_parallelism=enable_expert_tensor_parallelism + enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, ) self.bias_gelu_fusion = False @@ -128,13 +136,17 @@ def __init__(self, config, moe=False, enable_expert_tensor_parallelism=False): elif args.onnx_safe: self.activation_func = erf_gelu elif args.swiglu: + def swiglu(x): x = torch.chunk(x, 2, dim=-1) return F.silu(x[0]) * x[1] + self.activation_func = swiglu elif args.squared_relu: + def squared_relu(x): return torch.pow(F.relu(x), 2) + self.activation_func = squared_relu else: self.bias_gelu_fusion = args.bias_gelu_fusion @@ -149,7 +161,7 @@ def squared_relu(x): bias=self.add_bias, input_is_parallel=True, moe=moe, - enable_expert_tensor_parallelism=enable_expert_tensor_parallelism + enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, ) def forward(self, hidden_states): @@ -171,10 +183,12 @@ def forward(self, hidden_states): output, output_bias = self.dense_4h_to_h(intermediate_parallel) return output, output_bias + class SwitchMLP(MegatronModule): """ Routes input to one of N MLP "experts" """ + def __init__(self, config): super(SwitchMLP, self).__init__() args = get_args() @@ -191,29 +205,29 @@ def forward(self, hidden_states): route = self.router(hidden_states) route = torch.nn.functional.softmax(route, dim=2) max_prob, max_ind = torch.max(route, dim=2) - max_prob = torch.unsqueeze(max_prob, 2) # [s b 1] + max_prob = torch.unsqueeze(max_prob, 2) # [s b 1] # TODO (rprenger) TODO this could be made easier to read # Converting [s, b, h] to [s*b, h]. 
# Each vector could be routed differently - hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [s*b h] - max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1] - max_ind = max_ind.view(-1) # [s*b] + hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [s*b h] + max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1] + max_ind = max_ind.view(-1) # [s*b] output_total = torch.empty_like(hidden_states) output_bias_total = torch.empty_like(hidden_states) - #TODO (rprenger) This does each expert in serial, but it could be parallelized + # TODO (rprenger) This does each expert in serial, but it could be parallelized for expert_num, expert in enumerate(self.experts): local_indices = (max_ind == expert_num).nonzero() - hidden = hidden_states[local_indices,:] + hidden = hidden_states[local_indices, :] output, output_bias = expert(hidden) output_bias = output_bias.expand_as(output) - output_total[local_indices,:] = output - output_bias_total[local_indices,:] = output_bias + output_total[local_indices, :] = output + output_bias_total[local_indices, :] = output_bias - output_total = output_total*max_prob - output_bias_total = output_bias_total*max_prob + output_total = output_total * max_prob + output_bias_total = output_bias_total * max_prob output_total = output_total.view(s, b, h) output_bias_total = output_bias_total.view(s, b, h) @@ -222,8 +236,7 @@ def forward(self, hidden_states): class CoreAttention(MegatronModule): - def __init__(self, layer_number, config, - attn_mask_type=AttnMaskType.padding): + def __init__(self, layer_number, config, attn_mask_type=AttnMaskType.padding): super(CoreAttention, self).__init__() self.fp16 = config.fp16 self.bf16 = config.bf16 @@ -242,14 +255,19 @@ def __init__(self, layer_number, config, seq_parallel_world_size = 1 if parallel_state.sequence_parallel_is_initialized(): seq_parallel_world_size = parallel_state.get_sequence_parallel_world_size() - world_size = seq_parallel_world_size if seq_parallel_world_size 
> 1 else parallel_state.get_tensor_model_parallel_world_size() + world_size = ( + seq_parallel_world_size + if seq_parallel_world_size > 1 + else parallel_state.get_tensor_model_parallel_world_size() + ) - self.hidden_size_per_partition = core.utils.divide(projection_size, - world_size) + self.hidden_size_per_partition = core.utils.divide(projection_size, world_size) self.hidden_size_per_attention_head = core.utils.divide( - projection_size, config.num_attention_heads) + projection_size, config.num_attention_heads + ) self.num_attention_heads_per_partition = core.utils.divide( - config.num_attention_heads, world_size) + config.num_attention_heads, world_size + ) coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -258,49 +276,56 @@ def __init__(self, layer_number, config, self.norm_factor *= coeff self.scale_mask_softmax = FusedScaleMaskSoftmax( - self.fp16, self.bf16, + self.fp16, + self.bf16, self.attn_mask_type, config.masked_softmax_fusion, attention_mask_func, self.attention_softmax_in_fp32, - coeff) + coeff, + ) # Dropout. Note that for a single iteration, this layer will generate # different outputs on different number of parallel partitions but # on average it should not be partition dependent. self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - def forward(self, query_layer, key_layer, - value_layer, attention_mask): + def forward(self, query_layer, key_layer, value_layer, attention_mask): # =================================== # Raw attention scores. 
[b, np, s, s] # =================================== # [b, np, sq, sk] - output_size = (query_layer.size(1), - query_layer.size(2), - query_layer.size(0), - key_layer.size(0)) + output_size = ( + query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0), + ) # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], - output_size[0] * output_size[1], -1) + query_layer = query_layer.view( + output_size[2], output_size[0] * output_size[1], -1 + ) # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], - output_size[0] * output_size[1], -1) + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) # preallocting input tensor: [b * np, sq, sk] matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor( - (output_size[0]*output_size[1], output_size[2], output_size[3]), - query_layer.dtype, "mpu") + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query_layer.dtype, + "mpu", + ) # Raw attention scores. [b * np, sq, sk] matmul_result = torch.baddbmm( matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] + query_layer.transpose(0, 1), # [b * np, sq, hn] key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, alpha=(1.0/self.norm_factor)) + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) @@ -310,8 +335,7 @@ def forward(self, query_layer, key_layer, # =========================== # attention scores and attention mask [b, np, sq, sk] - attention_probs = self.scale_mask_softmax(attention_scores, - attention_mask) + attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
@@ -329,18 +353,22 @@ def forward(self, query_layer, key_layer, # [sk, b, np, hn] --> [b, np, sq, hn] # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), - value_layer.size(2), - query_layer.size(0), - value_layer.size(3)) + output_size = ( + value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3), + ) # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), - output_size[0] * output_size[1], -1) + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1 + ) # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], - output_size[2], -1) + attention_probs = attention_probs.view( + output_size[0] * output_size[1], output_size[2], -1 + ) # matmul: [b * np, sq, hn] context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) @@ -352,8 +380,9 @@ def forward(self, query_layer, key_layer, context_layer = context_layer.permute(2, 0, 1, 3).contiguous() # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.hidden_size_per_partition,) + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size_per_partition, + ) context_layer = context_layer.view(*new_context_layer_shape) return context_layer @@ -369,19 +398,47 @@ class FlashSelfAttention(torch.nn.Module): attention_dropout: The dropout rate to apply to the attention (default: 0.0) """ - def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, - device=None, dtype=None): + + def __init__( + self, + causal=False, + softmax_scale=None, + attention_dropout=0.0, + device=None, + dtype=None, + ): super().__init__() - assert flash_attn_unpadded_func is not None or flash_attn_varlen_func is not None or flash_attn_builder is not None, \ - ('Please install FlashAttention first, e.g., with pip install flash-attn or implement your own flash attention') - assert rearrange is not 
None, 'Please install einops first, e.g., with pip install einops' + assert ( + flash_attn_unpadded_func is not None + or flash_attn_varlen_func is not None + or flash_attn_builder is not None + ), "Please install FlashAttention first, e.g., with pip install flash-attn or implement your own flash attention" + assert ( + rearrange is not None + ), "Please install einops first, e.g., with pip install einops" self.causal = causal self.softmax_scale = softmax_scale self.dropout_p = attention_dropout # Use FlashAttention-2 when args.use_flash_attn_v2 is True args = get_args() - self.flash_attn_func = flash_attn_varlen_func if args.use_flash_attn_v2 else flash_attn_unpadded_func + self.use_flash_attn_builder_v1 = False + self.use_flash_attn_builder_v2 = False + self.use_flash_attn = False + if args.use_flash_attn_builder: + if hasattr(flash_attn_builder, "flash_attn_func"): + self.flash_attn_func = flash_attn_builder.flash_attn_func + self.use_flash_attn_builder_v1 = True + else: + self.flash_attn_func = flash_attn_builder.flash_attn_func_v2 + self.use_flash_attn_builder_v2 = True + else: + self.flash_attn_func = ( + flash_attn_varlen_func + if args.use_flash_attn_v2 + else flash_attn_unpadded_func + ) + self.use_flash_attn = True def forward(self, q, k, v): """Implements the multihead softmax attention. @@ -390,52 +447,85 @@ def forward(self, q, k, v): q, k, v: The tensor containing the query, key, and value. (B, S, H, D) """ - assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v))) + assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v))) assert all((get_accelerator().on_accelerator(i) for i in (q, k, v))) - # if get_accelerator().device_name() == 'cuda': - # assert all((i.is_cuda for i in (q,k,v))) - # else: - # assert all((i.is_xpu for i in (q,k,v))) batch_size, seqlen_q = q.shape[0], q.shape[1] seqlen_k = k.shape[1] - if get_accelerator().device_name() == 'cuda': - # goes for cuda device - q, k, v = [rearrange(x, 'b s ... 
-> (b s) ...') for x in [q, k, v]] - cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, - device=q.device) + if self.use_flash_attn: + q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * seqlen_q, + step=seqlen_q, + dtype=torch.int32, + device=q.device, + ) + elif self.use_flash_attn_builder_v1: + q, k, v = [ + rearrange(x, "b s h d -> b h s d").contiguous() for x in [q, k, v] + ] else: - # goes for other device - q, k, v = [rearrange(x, 'b s h d -> b h s d').contiguous() for x in [q, k, v]] + # use_flash_attn_builder_v2 + q, k, v = [rearrange(x, "b s h d -> b h s d") for x in [q, k, v]] if self.training: # during training q,k,v always have same seqlen assert seqlen_k == seqlen_q is_causal = self.causal - cu_seqlens_k = cu_seqlens_q if get_accelerator().device_name() == 'cuda' else None + cu_seqlens_k = ( + cu_seqlens_q if get_accelerator().device_name() == "cuda" else None + ) dropout_p = self.dropout_p else: # turn off FA causal mask after first inference autoregressive iteration # only on first autoregressive step q,k,v have same seqlen is_causal = seqlen_q == seqlen_k - cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, - device=q.device) if get_accelerator().device_name() == 'cuda' else None + cu_seqlens_k = ( + torch.arange( + 0, + (batch_size + 1) * seqlen_k, + step=seqlen_k, + dtype=torch.int32, + device=q.device, + ) + if get_accelerator().device_name() == "cuda" + else None + ) dropout_p = 0 - output = self.flash_attn_func( - q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, - dropout_p, - softmax_scale=self.softmax_scale, causal=is_causal - ) if get_accelerator().device_name() == 'cuda' else flash_attn_builder.flash_attn_func( - q, k, v, self.dropout_p, self.softmax_scale, is_causal - ) + if self.use_flash_attn: + output = self.flash_attn_func( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, 
+ seqlen_q, + seqlen_k, + dropout_p, + softmax_scale=self.softmax_scale, + causal=is_causal, + ) + else: + # use_flash_attn_builder + output = self.flash_attn_func( + q, k, v, self.dropout_p, self.softmax_scale, is_causal + ) + + if self.use_flash_attn: + output = rearrange(output, "(b s) ... -> b s ...", b=batch_size) + elif self.use_flash_attn_builder_v1: + output = rearrange(output, "b h s d -> b s h d").contiguous() + else: + # use_flash_attn_builder_v2: + output = rearrange(output, "b h s d -> b s h d") - output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) if get_accelerator().device_name() == 'cuda' else rearrange( - output, 'b h s d -> b s h d').contiguous() return output + class FlashSelfAttentionTriton(torch.nn.Module): """Implement the scaled dot product attention with softmax. Arguments @@ -446,11 +536,22 @@ class FlashSelfAttentionTriton(torch.nn.Module): attention_dropout: The dropout rate to apply to the attention (default: 0.0) """ - def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, - device=None, dtype=None): + + def __init__( + self, + causal=False, + softmax_scale=None, + attention_dropout=0.0, + device=None, + dtype=None, + ): super().__init__() - assert flash_attn_func is not None, ('Triton version of FlashAttention is not installed.') - assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' + assert ( + flash_attn_func is not None + ), "Triton version of FlashAttention is not installed." + assert ( + rearrange is not None + ), "Please install einops first, e.g., with pip install einops" self.causal = causal self.softmax_scale = softmax_scale self.dropout_p = attention_dropout @@ -464,13 +565,13 @@ def forward(self, q, k, v): assert q.dtype in [torch.float16, torch.bfloat16] assert q.is_cuda - q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() - for x in (q, k, v)] - + q, k, v = [rearrange(x, "s b ... 
-> b s ...").contiguous() for x in (q, k, v)] + output = flash_attn_func(q, k, v, None, self.causal) - output = rearrange(output, 'b s h d -> s b (h d)').contiguous() + output = rearrange(output, "b s h d -> s b (h d)").contiguous() return output + class ParallelAttention(MegatronModule): """Parallel self-attention layer abstract class. @@ -478,9 +579,13 @@ class ParallelAttention(MegatronModule): and returns output of the same size. """ - def __init__(self, config, layer_number, - attention_type=AttnType.self_attn, - attn_mask_type=AttnMaskType.padding): + def __init__( + self, + config, + layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.padding, + ): super(ParallelAttention, self).__init__() args = get_args() self.layer_number = max(1, layer_number) @@ -490,11 +595,18 @@ def __init__(self, config, layer_number, self.sequence_parallel = config.sequence_parallel self.num_attention_heads = config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads - self.use_gqa = (self.num_attention_heads != self.num_key_value_heads) - - self.use_flash_attn = (args.use_flash_attn_v1 or args.use_flash_attn_triton or args.use_flash_attn_v2) \ - and attention_type == AttnType.self_attn \ + self.use_gqa = self.num_attention_heads != self.num_key_value_heads + + self.use_flash_attn = ( + ( + args.use_flash_attn_v1 + or args.use_flash_attn_triton + or args.use_flash_attn_v2 + or args.use_flash_attn_builder + ) + and attention_type == AttnType.self_attn and self.attn_mask_type == AttnMaskType.causal + ) self.use_flash_attn_triton = args.use_flash_attn_triton if self.use_flash_attn: global flash_attn_builder @@ -504,37 +616,53 @@ def __init__(self, config, layer_number, flash_attn_builder = None if args.use_flash_attn_v1: - assert flash_attn_unpadded_func != None or flash_attn_builder != None, ("Cannot import FlashAttention v1 " - "and Cannot find FlashAttention Builder") + assert ( + flash_attn_unpadded_func != None + ), "Cannot import 
FlashAttention v1 " if args.use_flash_attn_v2: - assert flash_attn_varlen_func != None, "Cannot import FlashAttention v2 " + assert ( + flash_attn_varlen_func != None + ), "Cannot import FlashAttention v2 " if args.use_flash_attn_triton: assert flash_attn_func != None, "Cannot import FlashAttention triton " + if args.use_flash_attn_builder: + assert ( + flash_attn_builder != None + ), "Cannot find FlashAttention op builder " - assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' - 'self-attention for now') - assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' - 'supports causal mask for now') + assert attention_type == AttnType.self_attn, ( + "FlashAttention code path only supports " "self-attention for now" + ) + assert self.attn_mask_type == AttnMaskType.causal, ( + "FlashAttention code path only " "supports causal mask for now" + ) if rearrange is None: - raise ImportError('einops is not installed, please install with pip install einops') + raise ImportError( + "einops is not installed, please install with pip install einops" + ) projection_size = config.kv_channels * config.num_attention_heads # Per attention head and per partition values. 
world_size = parallel_state.get_tensor_model_parallel_world_size() self.hidden_size_per_attention_head = core.utils.divide( - projection_size, config.num_attention_heads) + projection_size, config.num_attention_heads + ) self.num_attention_heads_per_partition = core.utils.divide( - config.num_attention_heads, world_size) + config.num_attention_heads, world_size + ) # Per GQA head and per partition values self.num_key_value_heads_per_partition = core.utils.divide( - config.num_key_value_heads, world_size) + config.num_key_value_heads, world_size + ) self.num_key_value_groups = core.utils.divide( - config.num_attention_heads, config.num_key_value_heads) + config.num_attention_heads, config.num_key_value_heads + ) kv_projection_size = config.kv_channels * config.num_key_value_heads assert self.hidden_size_per_attention_head == core.utils.divide( - kv_projection_size, config.num_key_value_heads) + kv_projection_size, config.num_key_value_heads + ) # Strided linear layer. if attention_type == AttnType.self_attn: @@ -544,7 +672,8 @@ def __init__(self, config, layer_number, config=config, init_method=config.init_method, bias=args.add_bias_linear, - gather_output=False) + gather_output=False, + ) else: assert attention_type == AttnType.cross_attn self.query = tensor_parallel.ColumnParallelLinear( @@ -553,8 +682,8 @@ def __init__(self, config, layer_number, config=config, init_method=config.init_method, bias=config.add_bias_linear, - gather_output=False) - + gather_output=False, + ) self.key_value = tensor_parallel.ColumnParallelLinear( config.hidden_size, @@ -562,28 +691,48 @@ def __init__(self, config, layer_number, config=config, init_method=config.init_method, bias=config.add_bias_linear, - gather_output=False) + gather_output=False, + ) # Currently FlashAttention only works with causal mask if self.use_flash_attn_triton: - local_attn = FlashSelfAttentionTriton(causal=True, attention_dropout=args.attention_dropout) + local_attn = FlashSelfAttentionTriton( + causal=True, 
attention_dropout=args.attention_dropout + ) elif self.use_flash_attn: - local_attn = FlashSelfAttention(causal=True, attention_dropout=config.attention_dropout) + local_attn = FlashSelfAttention( + causal=True, attention_dropout=config.attention_dropout + ) else: local_attn = CoreAttention(self.layer_number, config, self.attn_mask_type) - self.enable_ds_sequence_parallel = parallel_state.get_sequence_parallel_world_size() > 1 \ - or args.force_ds_sequence_parallel + self.enable_ds_sequence_parallel = ( + parallel_state.get_sequence_parallel_world_size() > 1 + or args.force_ds_sequence_parallel + ) if self.enable_ds_sequence_parallel: - assert dist_attn_supported, 'Distributed attention is not supported in this DeepSpeed version' - assert args.num_attention_heads % parallel_state.get_sequence_parallel_world_size() == 0 - self.dist_attn = DistributedAttention(local_attn, parallel_state.get_sequence_parallel_group()) + assert ( + dist_attn_supported + ), "Distributed attention is not supported in this DeepSpeed version" + assert ( + args.num_attention_heads + % parallel_state.get_sequence_parallel_world_size() + == 0 + ) + self.dist_attn = DistributedAttention( + local_attn, + parallel_state.get_sequence_parallel_group(), + gather_idx=1 if args.use_flash_attn_v1 or args.use_flash_attn_v2 else 0, + ) + # flash_attn_cuda assumes [b, s, nh, hd] layout, we need to make sure all2all gathers into the correct sequence dimension. else: if self.use_flash_attn: self.core_attention_flash = local_attn else: self.core_attention = local_attn - self.checkpoint_core_attention = config.recompute_granularity == 'selective' + self.checkpoint_core_attention = ( + config.recompute_granularity == "selective" + ) # Output. 
self.dense = tensor_parallel.RowParallelLinear( @@ -593,29 +742,38 @@ def __init__(self, config, layer_number, init_method=config.output_layer_init_method, bias=args.add_bias_linear, input_is_parallel=True, - skip_bias_add=True) - + skip_bias_add=True, + ) - def _checkpointed_attention_forward(self, query_layer, key_layer, - value_layer, attention_mask, - rotary_pos_emb=None): + def _checkpointed_attention_forward( + self, query_layer, key_layer, value_layer, attention_mask, rotary_pos_emb=None + ): """Forward method with activation checkpointing.""" + def custom_forward(*inputs): query_layer = inputs[0] key_layer = inputs[1] value_layer = inputs[2] attention_mask = inputs[3] - output_ = self.core_attention(query_layer, key_layer, - value_layer, attention_mask) + output_ = self.core_attention( + query_layer, key_layer, value_layer, attention_mask + ) return output_ - q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \ - else rotary_pos_emb + q_pos_emb, k_pos_emb = ( + (None, None) if rotary_pos_emb is None else rotary_pos_emb + ) hidden_states = tensor_parallel.checkpoint( custom_forward, - False, query_layer, key_layer, value_layer, attention_mask, - q_pos_emb, k_pos_emb) + False, + query_layer, + key_layer, + value_layer, + attention_mask, + q_pos_emb, + k_pos_emb, + ) return hidden_states @@ -626,28 +784,49 @@ def _allocate_memory(self, inference_max_sequence_len, batch_size): self.num_attention_heads_per_partition, self.hidden_size_per_attention_head, dtype=self.params_dtype, - device=get_accelerator().current_device_name()) + device=get_accelerator().current_device_name(), + ) def repeat_kv(self, hidden_states, n_rep): slen, batch, num_key_value_heads_per_partition, head_dim = hidden_states.shape if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, :, None, :].expand( - slen, batch, num_key_value_heads_per_partition, n_rep, head_dim) - return hidden_states.reshape(slen, batch, - num_key_value_heads_per_partition * n_rep, - 
head_dim) - + elif num_key_value_heads_per_partition == 1: + # If no of KV heads is 1 then just perform expand operation + # instead of unsqueeze, expand and reshape to match query states. + return hidden_states.expand(slen, batch, n_rep, head_dim) + else: + hidden_states = hidden_states[:, :, :, None, :].expand( + slen, batch, num_key_value_heads_per_partition, n_rep, head_dim + ) + return hidden_states.reshape( + slen, batch, num_key_value_heads_per_partition * n_rep, head_dim + ) + def split_tensor(self, mixed_x_layer): - query_layer = mixed_x_layer[:, :, :, :-2, :].reshape(mixed_x_layer.shape[:2] + (-1, self.hidden_size_per_attention_head)) - key_layer = mixed_x_layer[:, :, :, -2, :] - value_layer = mixed_x_layer[:, :, :, -1, :] + query_layer, key_layer, value_layer = torch.split( + mixed_x_layer, [self.num_key_value_groups, 1, 1], dim=-2 + ) + query_layer = query_layer.reshape( + mixed_x_layer.shape[:2] + + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + ) + key_layer = torch.squeeze(key_layer, -2) + value_layer = torch.squeeze(value_layer, -2) return query_layer, key_layer, value_layer - def forward(self, hidden_states, attention_mask, - encoder_output=None, inference_params=None, - rotary_pos_emb=None): + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + inference_params=None, + rotary_pos_emb=None, + ): # hidden_states: [sq, b, h] # ================================================= @@ -659,15 +838,20 @@ def forward(self, hidden_states, attention_mask, inf_max_seq_len = inference_params.max_sequence_len inf_max_batch_size = inference_params.max_batch_size inference_key_memory = self._allocate_memory( - inf_max_seq_len, inf_max_batch_size) + inf_max_seq_len, inf_max_batch_size + ) inference_value_memory = self._allocate_memory( - inf_max_seq_len, inf_max_batch_size) + inf_max_seq_len, inf_max_batch_size + ) inference_params.key_value_memory_dict[self.layer_number] = ( - 
inference_key_memory, inference_value_memory) + inference_key_memory, + inference_value_memory, + ) is_first_step = True else: - inference_key_memory, inference_value_memory = \ + inference_key_memory, inference_value_memory = ( inference_params.key_value_memory_dict[self.layer_number] + ) # ===================== # Query, Key, and Value @@ -678,43 +862,45 @@ def forward(self, hidden_states, attention_mask, mixed_x_layer, _ = self.query_key_value(hidden_states) # [sq, b, ((nq + 2 * nkv) * hn)] --> [sq, b, nkv, (nq // nkv + 2), hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (-1, (self.num_key_value_groups + 2), - self.hidden_size_per_attention_head) + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + -1, + (self.num_key_value_groups + 2), + self.hidden_size_per_attention_head, + ) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) # [sq, b, nkv, (nq // nkv + 2), hn] --> 3 [sq, b, np, hn] - (query_layer, - key_layer, - value_layer) = self.split_tensor(mixed_x_layer) + (query_layer, key_layer, value_layer) = self.split_tensor(mixed_x_layer) # Repeat kv if self.use_gqa: key_layer = self.repeat_kv(key_layer, self.num_key_value_groups) - value_layer = self.repeat_kv(value_layer, - self.num_key_value_groups) + value_layer = self.repeat_kv(value_layer, self.num_key_value_groups) else: - assert not self.use_gqa, 'GQA + cross-attn not tested yet' + assert not self.use_gqa, "GQA + cross-attn not tested yet" # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] - new_tensor_shape = mixed_kv_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 2 * self.hidden_size_per_attention_head) + new_tensor_shape = mixed_kv_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head, + ) mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] - (key_layer, - value_layer) = 
tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2) + (key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim( + mixed_kv_layer, 2 + ) # Attention head [sq, b, h] --> [sq, b, hp] query_layer, _ = self.query(hidden_states) # [sq, b, hp] --> [sq, b, np, hn] - new_tensor_shape = query_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head) + new_tensor_shape = query_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) query_layer = query_layer.view(*new_tensor_shape) # ================================== @@ -726,7 +912,7 @@ def forward(self, hidden_states, attention_mask, if isinstance(rotary_pos_emb, tuple): rotary_pos_emb = rotary_pos_emb else: - rotary_pos_emb = ((rotary_pos_emb,) * 2) + rotary_pos_emb = (rotary_pos_emb,) * 2 if inference_params: batch_start = inference_params.batch_size_offset @@ -736,15 +922,16 @@ def forward(self, hidden_states, attention_mask, sequence_end = sequence_start + key_layer.size(0) assert sequence_end <= inference_key_memory.size(0) # Copy key and values. - inference_key_memory[sequence_start:sequence_end, - batch_start:batch_end, ...] = key_layer - inference_value_memory[sequence_start:sequence_end, - batch_start:batch_end, ...] = value_layer - key_layer = inference_key_memory[ - :sequence_end, batch_start:batch_end, ...] + inference_key_memory[ + sequence_start:sequence_end, batch_start:batch_end, ... + ] = key_layer + inference_value_memory[ + sequence_start:sequence_end, batch_start:batch_end, ... + ] = value_layer + key_layer = inference_key_memory[:sequence_end, batch_start:batch_end, ...] value_layer = inference_value_memory[ - :sequence_end, batch_start:batch_end, ...] - + :sequence_end, batch_start:batch_end, ... 
+ ] # adjust the key rotary positional embedding if rotary_pos_emb is not None: @@ -766,7 +953,6 @@ def forward(self, hidden_states, attention_mask, k_pos_emb = k_pos_emb[:sequence_end, :, :, :] rotary_pos_emb = (q_pos_emb, k_pos_emb) - # ================================== # core attention computation # ================================== @@ -782,38 +968,58 @@ def forward(self, hidden_states, attention_mask, # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) if self.enable_ds_sequence_parallel: + batch_dim_idx = 1 if self.use_flash_attn: if not self.use_flash_attn_triton: - query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous() - for x in (query_layer, key_layer, value_layer)] - - context_layer = self.dist_attn(query_layer, key_layer, value_layer) + query_layer, key_layer, value_layer = [ + rearrange(x, "s b ... -> b s ...").contiguous() + for x in (query_layer, key_layer, value_layer) + ] + batch_dim_idx = 0 + + context_layer = self.dist_attn( + query_layer, key_layer, value_layer, batch_dim_idx + ) if not self.use_flash_attn_triton: - context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + context_layer = rearrange( + context_layer, "b s h d -> s b (h d)" + ).contiguous() else: - context_layer = self.dist_attn(query_layer, key_layer, value_layer, attention_mask) + context_layer = self.dist_attn( + query_layer, key_layer, value_layer, attention_mask + ) else: if self.use_flash_attn: if not self.use_flash_attn_triton: - query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous() - for x in (query_layer, key_layer, value_layer)] + query_layer, key_layer, value_layer = [ + rearrange(x, "s b ... 
-> b s ...").contiguous() + for x in (query_layer, key_layer, value_layer) + ] if self.sequence_parallel: - context_layer = self.core_attention_flash(query_layer, key_layer, value_layer) + context_layer = self.core_attention_flash( + query_layer, key_layer, value_layer + ) else: with tensor_parallel.get_cuda_rng_tracker().fork(): - context_layer = self.core_attention_flash(query_layer, key_layer, value_layer) + context_layer = self.core_attention_flash( + query_layer, key_layer, value_layer + ) if not self.use_flash_attn_triton: - context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + context_layer = rearrange( + context_layer, "b s h d -> s b (h d)" + ).contiguous() else: if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( - query_layer, key_layer, value_layer, attention_mask) + query_layer, key_layer, value_layer, attention_mask + ) else: context_layer = self.core_attention( - query_layer, key_layer, value_layer, attention_mask) + query_layer, key_layer, value_layer, attention_mask + ) # ================= # Output. 
[sq, b, h] @@ -836,22 +1042,21 @@ def bias_dropout_add(x, bias, residual, prob, training): def get_bias_dropout_add(training): def _bias_dropout_add(x, bias, residual, prob): return bias_dropout_add(x, bias, residual, prob, training) + return _bias_dropout_add @torch.jit.script -def bias_dropout_add_fused_train(x: torch.Tensor, - bias: Optional[torch.Tensor], - residual: torch.Tensor, - prob: float) -> torch.Tensor: +def bias_dropout_add_fused_train( + x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, prob: float +) -> torch.Tensor: return bias_dropout_add(x, bias, residual, prob, True) @torch.jit.script -def bias_dropout_add_fused_inference(x: torch.Tensor, - bias: Optional[torch.Tensor], - residual: torch.Tensor, - prob: float) -> torch.Tensor: +def bias_dropout_add_fused_inference( + x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, prob: float +) -> torch.Tensor: return bias_dropout_add(x, bias, residual, prob, False) @@ -862,10 +1067,15 @@ class ParallelTransformerLayer(MegatronModule): output of the same size. """ - def __init__(self, config, - layer_number, layer_type=LayerType.encoder, - self_attn_mask_type=AttnMaskType.padding, - drop_path_rate=0., num_experts=1): + def __init__( + self, + config, + layer_number, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + drop_path_rate=0.0, + num_experts=1, + ): # retriever=None): args = get_args() @@ -873,106 +1083,130 @@ def __init__(self, config, self.layer_number = layer_number self.layer_type = layer_type - self.apply_residual_connection_post_layernorm \ - = config.apply_residual_connection_post_layernorm + self.apply_residual_connection_post_layernorm = ( + config.apply_residual_connection_post_layernorm + ) self.bf16 = config.bf16 self.fp32_residual_connection = config.fp32_residual_connection # Layernorm on the input data. 
- if args.normalization == 'layernorm': - if get_accelerator().device_name() == 'cuda': + if args.normalization == "layernorm": + if get_accelerator().device_name() == "cuda": self.input_layernorm = LayerNorm( config.hidden_size, eps=config.layernorm_epsilon, no_persist_layer_norm=args.no_persist_layer_norm, sequence_parallel=config.sequence_parallel, apply_layernorm_1p=args.apply_layernorm_1p, - mem_efficient_ln=args.mem_efficient_ln) + mem_efficient_ln=args.mem_efficient_ln, + ) else: self.input_layernorm = LayerNorm( - config.hidden_size, - eps=config.layernorm_epsilon) + config.hidden_size, eps=config.layernorm_epsilon + ) else: - self.input_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon) + self.input_layernorm = RMSNorm( + config.hidden_size, + config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel, + ) + # self.input_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon_ # Self attention. self.self_attention = ParallelAttention( config, layer_number, attention_type=AttnType.self_attn, - attn_mask_type=self_attn_mask_type) + attn_mask_type=self_attn_mask_type, + ) self.hidden_dropout = config.hidden_dropout self.bias_dropout_fusion = config.bias_dropout_fusion self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None # Layernorm on the attention output - if args.normalization == 'layernorm': - if get_accelerator().device_name() == 'cuda': + if args.normalization == "layernorm": + if get_accelerator().device_name() == "cuda": self.post_attention_layernorm = LayerNorm( config.hidden_size, eps=config.layernorm_epsilon, no_persist_layer_norm=not config.persist_layer_norm, sequence_parallel=config.sequence_parallel, apply_layernorm_1p=args.apply_layernorm_1p, - mem_efficient_ln=args.mem_efficient_ln) + mem_efficient_ln=args.mem_efficient_ln, + ) else: self.post_attention_layernorm = LayerNorm( - config.hidden_size, - eps=config.layernorm_epsilon) + config.hidden_size, eps=config.layernorm_epsilon + ) 
else: - self.post_attention_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, + config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel, + ) # Cross attention. - if self.layer_type in (LayerType.decoder, - LayerType.retro_decoder, - LayerType.retro_decoder_with_retriever, - LayerType.retro_encoder): + if self.layer_type in ( + LayerType.decoder, + LayerType.retro_decoder, + LayerType.retro_decoder_with_retriever, + LayerType.retro_encoder, + ): self.inter_attention = ParallelAttention( - config, - layer_number, - attention_type=AttnType.cross_attn) + config, layer_number, attention_type=AttnType.cross_attn + ) # Layernorm on the attention output. - if args.normalization == 'layernorm': + if args.normalization == "layernorm": self.post_inter_attention_layernorm = LayerNorm( config.hidden_size, eps=config.layernorm_epsilon, no_persist_layer_norm=not config.persist_layer_norm, sequence_parallel=config.sequence_parallel, apply_layernorm_1p=args.apply_layernorm_1p, - mem_efficient_ln=args.mem_efficient_ln) + mem_efficient_ln=args.mem_efficient_ln, + ) else: - self.post_inter_attention_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon) + self.post_inter_attention_layernorm = RMSNorm( + config.hidden_size, + config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel, + ) # MLP self.num_experts = num_experts if args.num_experts_switch is not None: - self.mlp = SwitchMLP(config) # Megatron-LM's MoE + self.mlp = SwitchMLP(config) # Megatron-LM's MoE else: - if self.num_experts <= 1: # dense, not MoE + if self.num_experts <= 1: # dense, not MoE self.mlp = ParallelMLP(config) - else: # DeepSpeed's MoE + else: # DeepSpeed's MoE enable_expert_tensor_parallelism = args.enable_expert_tensor_parallelism - self.mlp = MoE(args.hidden_size, - ParallelMLP(config, - moe=True, - enable_expert_tensor_parallelism=enable_expert_tensor_parallelism), - 
num_experts=self.num_experts, - ep_size=args.moe_expert_parallel_size, - k=args.topk, - use_residual=(args.mlp_type == 'residual'), - capacity_factor=args.moe_train_capacity_factor, - eval_capacity_factor=args.moe_eval_capacity_factor, - min_capacity=args.moe_min_capacity, - drop_tokens=args.moe_token_dropping, - use_tutel=args.use_tutel, - enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, - top2_2nd_expert_sampling=args.moe_top2_2nd_expert_sampling) + self.mlp = MoE( + args.hidden_size, + ParallelMLP( + config, + moe=True, + enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, + ), + num_experts=self.num_experts, + ep_size=args.moe_expert_parallel_size, + k=args.topk, + use_residual=(args.mlp_type == "residual"), + capacity_factor=args.moe_train_capacity_factor, + eval_capacity_factor=args.moe_eval_capacity_factor, + min_capacity=args.moe_min_capacity, + drop_tokens=args.moe_token_dropping, + use_tutel=args.use_tutel, + enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, + top2_2nd_expert_sampling=args.moe_top2_2nd_expert_sampling, + ) # Set bias+dropout+add fusion grad_enable execution handler. 
- TORCH_MAJOR = int(torch.__version__.split('.')[0]) - TORCH_MINOR = int(torch.__version__.split('.')[1]) + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) - self.bias_dropout_add_exec_handler = \ - nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = ( + nullcontext if use_nvfuser else torch.enable_grad + ) if args.retro_add_retriever: retro_args = get_retro_args() @@ -990,23 +1224,24 @@ def __init__(self, config, pre_process=True, post_process=False, ) - self._retriever_key = 'retriever' + self._retriever_key = "retriever" else: self.retriever = None - def default_decoder_cross_attention(self, - encoder_output, - enc_dec_attn_mask, - layernorm_input, - layernorm_output, - bias_dropout_add_func): - '''Cross attention for a standard encoder-decoder model.''' + def default_decoder_cross_attention( + self, + encoder_output, + enc_dec_attn_mask, + layernorm_input, + layernorm_output, + bias_dropout_add_func, + ): + """Cross attention for a standard encoder-decoder model.""" # Attention. - attention_output, attention_bias = \ - self.inter_attention(layernorm_output, - enc_dec_attn_mask, - encoder_output=encoder_output) + attention_output, attention_bias = self.inter_attention( + layernorm_output, enc_dec_attn_mask, encoder_output=encoder_output + ) # Residual connection. if self.apply_residual_connection_post_layernorm: @@ -1020,21 +1255,17 @@ def default_decoder_cross_attention(self, # Bias-dropout-add. with self.bias_dropout_add_exec_handler(): layernorm_input = bias_dropout_add_func( - attention_output, - attention_bias, - residual, - self.hidden_dropout) + attention_output, attention_bias, residual, self.hidden_dropout + ) # Layer norm. 
layernorm_output = self.post_inter_attention_layernorm(layernorm_input) return layernorm_input, layernorm_output - def retro_encoder_cross_attention(self, - retriever_output, - layernorm_input, - layernorm_output, - bias_dropout_add_func): + def retro_encoder_cross_attention( + self, retriever_output, layernorm_input, layernorm_output, bias_dropout_add_func + ): """Cross attention for Retro encoder. Notation: @@ -1046,16 +1277,15 @@ def retro_encoder_cross_attention(self, r : Number of retrieved tokens (neighbors + continuation). """ - ns, bs, d = layernorm_output.shape # [r, bs * l * k, d] + ns, bs, d = layernorm_output.shape # [r, bs * l * k, d] # Divide sequence dimension into chunks. - chunked_outputs = layernorm_output.reshape(self.retro_retrieved_length, - -1, - self.retro_num_neighbors, - d) - chunked_outputs_before_layer_norm = \ - layernorm_input.reshape(self.retro_retrieved_length, -1, - self.retro_num_neighbors, d) # [r, bs*l, k, d] + chunked_outputs = layernorm_output.reshape( + self.retro_retrieved_length, -1, self.retro_num_neighbors, d + ) + chunked_outputs_before_layer_norm = layernorm_input.reshape( + self.retro_retrieved_length, -1, self.retro_num_neighbors, d + ) # [r, bs*l, k, d] # Per-chunk attention. layernorm_inputs = [] @@ -1063,51 +1293,55 @@ def retro_encoder_cross_attention(self, for k in range(self.retro_num_neighbors): # Attention. - chunked_output = chunked_outputs[:,:,k].contiguous() - attention_output, attention_bias = \ - self.inter_attention( - chunked_output, # Q (neighbor embedding) - None, - encoder_output=retriever_output) # K, V (hidden act) + chunked_output = chunked_outputs[:, :, k].contiguous() + attention_output, attention_bias = self.inter_attention( + chunked_output, # Q (neighbor embedding) + None, + encoder_output=retriever_output, + ) # K, V (hidden act) # Residual connection. 
if self.apply_residual_connection_post_layernorm: residual = chunked_output else: - residual = chunked_outputs_before_layer_norm[:,:,k] + residual = chunked_outputs_before_layer_norm[:, :, k] # Re-enable torch grad to enable fused optimization. with torch.enable_grad(): layernorm_input = bias_dropout_add_func( attention_output, - None if attention_bias is None else attention_bias.expand_as(residual), + ( + None + if attention_bias is None + else attention_bias.expand_as(residual) + ), residual, - self.hidden_dropout) + self.hidden_dropout, + ) layernorm_inputs.append(layernorm_input) # Layer norm. - layernorm_output = \ - self.post_inter_attention_layernorm(layernorm_input) + layernorm_output = self.post_inter_attention_layernorm(layernorm_input) layernorm_outputs.append(layernorm_output) # Concatenate layer norms. # layernorm_input : [r, k * bs * l, d] # layernorm_output : [r, k * bs * l, d] - layernorm_input = \ - torch.stack(layernorm_inputs, dim=1).reshape(ns, bs, d) - layernorm_output = \ - torch.stack(layernorm_outputs, dim=1).reshape(ns, bs, d) + layernorm_input = torch.stack(layernorm_inputs, dim=1).reshape(ns, bs, d) + layernorm_output = torch.stack(layernorm_outputs, dim=1).reshape(ns, bs, d) return layernorm_input, layernorm_output - def retro_decoder_cross_attention(self, - retriever_input, - retriever_output, - retriever_attn_mask, - layernorm_input, - layernorm_output, - inference_params, - bias_dropout_add_func): + def retro_decoder_cross_attention( + self, + retriever_input, + retriever_output, + retriever_attn_mask, + layernorm_input, + layernorm_output, + inference_params, + bias_dropout_add_func, + ): """Cross attention for Retro decoder. 
Notation: @@ -1128,22 +1362,27 @@ def retro_decoder_cross_attention(self, first_ns = ns % self.retro_chunk_length if first_ns > 0: raise Exception("test this case.") - first_chunk, rest_chunk = \ - layernorm_output[:first_ns], layernorm_output[first_ns:] + first_chunk, rest_chunk = ( + layernorm_output[:first_ns], + layernorm_output[first_ns:], + ) first_chunk = torch.nn.functional.pad( first_chunk, (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), - 'constant', - 0) - chunked_output = \ - torch.cat((first_chunk, rest_chunk), dim=0) # [l * m, bs, d] + "constant", + 0, + ) + chunked_output = torch.cat( + (first_chunk, rest_chunk), dim=0 + ) # [l * m, bs, d] else: - chunked_output = layernorm_output # [l * m, bs, d] - chunked_output = chunked_output \ - .reshape(l, self.retro_chunk_length, bs, d) \ - .permute(1, 2, 0, 3) \ - .reshape(self.retro_chunk_length, bs * l, d) \ + chunked_output = layernorm_output # [l * m, bs, d] + chunked_output = ( + chunked_output.reshape(l, self.retro_chunk_length, bs, d) + .permute(1, 2, 0, 3) + .reshape(self.retro_chunk_length, bs * l, d) .contiguous() + ) # Get Encoder Output retriever_output = self.retriever( @@ -1151,9 +1390,11 @@ def retro_decoder_cross_attention(self, attention_mask=retriever_attn_mask, retriever_output=chunked_output, retriever_attn_mask=retriever_attn_mask, - inference_params=inference_params) # [r, k * bs * l , d] + inference_params=inference_params, + ) # [r, k * bs * l , d] retriever_output = retriever_output.reshape( - self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d) # [r * k, bs * l, d] + self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d + ) # [r * k, bs * l, d] # Chunks. 
pad = (ns - 1) % self.retro_chunk_length @@ -1161,18 +1402,20 @@ def retro_decoder_cross_attention(self, padded_chunks = torch.nn.functional.pad( attending_chunks, (0, 0, 0, 0, 0, self.retro_chunk_length - 1), - 'constant', 0) - padded_chunked_output = padded_chunks \ - .reshape(l, self.retro_chunk_length, bs, d) \ - .permute(1, 2, 0, 3) + "constant", + 0, + ) + padded_chunked_output = padded_chunks.reshape( + l, self.retro_chunk_length, bs, d + ).permute(1, 2, 0, 3) padded_chunked_output = padded_chunked_output.reshape( - self.retro_chunk_length, bs * l, d).contiguous() + self.retro_chunk_length, bs * l, d + ).contiguous() # Encoder output. - attention_output, attention_bias = \ - self.inter_attention(padded_chunked_output, - None, - encoder_output=retriever_output) + attention_output, attention_bias = self.inter_attention( + padded_chunked_output, None, encoder_output=retriever_output + ) # Residual connection. if self.apply_residual_connection_post_layernorm: @@ -1184,17 +1427,27 @@ def retro_decoder_cross_attention(self, with torch.enable_grad(): layernorm_input = bias_dropout_add_func( attention_output, - None if attention_bias is None else attention_bias.expand_as(attention_output), + ( + None + if attention_bias is None + else attention_bias.expand_as(attention_output) + ), torch.zeros_like(attention_output), - self.hidden_dropout) - layernorm_input = layernorm_input \ - .reshape(self.retro_chunk_length, bs, l, d) \ - .permute(2, 0, 1, 3) # [l, m, bs, d] - layernorm_input = layernorm_input.reshape(self.retro_chunk_length * l, bs, d) + self.hidden_dropout, + ) + layernorm_input = layernorm_input.reshape( + self.retro_chunk_length, bs, l, d + ).permute( + 2, 0, 1, 3 + ) # [l, m, bs, d] + layernorm_input = layernorm_input.reshape( + self.retro_chunk_length * l, bs, d + ) layernorm_input = torch.nn.functional.pad( - layernorm_input, - (0, 0, 0, 0, pad, 0), - 'constant', 0)[:ns] # [ns, b, d] + layernorm_input, (0, 0, 0, 0, pad, 0), "constant", 0 + )[ + :ns + ] # 
[ns, b, d] layernorm_input = layernorm_input + residual # Layer norm post the decoder attention @@ -1202,25 +1455,31 @@ def retro_decoder_cross_attention(self, return retriever_output, layernorm_input, layernorm_output - def forward(self, hidden_states, attention_mask=None, - encoder_output=None, enc_dec_attn_mask=None, - retriever_input=None, - retriever_output=None, - retriever_attn_mask=None, - inference_params=None, - rotary_pos_emb=None): + def forward( + self, + hidden_states, + attention_mask=None, + encoder_output=None, + enc_dec_attn_mask=None, + retriever_input=None, + retriever_output=None, + retriever_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + aggregated_moe_loss=None, + ): # hidden_states: [s, b, h] # Layer norm at the beginning of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) # Self attention. - attention_output, attention_bias = \ - self.self_attention( - layernorm_output, - attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb) + attention_output, attention_bias = self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) # Residual connection. if self.apply_residual_connection_post_layernorm: @@ -1245,14 +1504,14 @@ def forward(self, hidden_states, attention_mask=None, attention_bias = attention_bias.expand_as(residual) with self.bias_dropout_add_exec_handler(): layernorm_input = bias_dropout_add_func( - attention_output, - attention_bias, - residual, - self.hidden_dropout) + attention_output, attention_bias, residual, self.hidden_dropout + ) else: - out = torch.nn.functional.dropout(attention_output + attention_bias, - p=self.hidden_dropout, - training=self.training) + out = torch.nn.functional.dropout( + attention_output + attention_bias, + p=self.hidden_dropout, + training=self.training, + ) layernorm_input = residual + self.drop_path(out) # Layer norm post the self attention. 
@@ -1262,23 +1521,25 @@ def forward(self, hidden_states, attention_mask=None, if self.layer_type == LayerType.encoder: pass elif self.layer_type == LayerType.decoder: - layernorm_input, layernorm_output = \ - self.default_decoder_cross_attention( - encoder_output, - enc_dec_attn_mask, - layernorm_input, - layernorm_output, - bias_dropout_add_func) + layernorm_input, layernorm_output = self.default_decoder_cross_attention( + encoder_output, + enc_dec_attn_mask, + layernorm_input, + layernorm_output, + bias_dropout_add_func, + ) elif self.layer_type == LayerType.retro_encoder: - layernorm_input, layernorm_output = \ - self.retro_encoder_cross_attention( - retriever_output, - layernorm_input, - layernorm_output, - bias_dropout_add_func) - elif self.layer_type in (LayerType.retro_decoder, - LayerType.retro_decoder_with_retriever): - retriever_output, layernorm_input, layernorm_output = \ + layernorm_input, layernorm_output = self.retro_encoder_cross_attention( + retriever_output, + layernorm_input, + layernorm_output, + bias_dropout_add_func, + ) + elif self.layer_type in ( + LayerType.retro_decoder, + LayerType.retro_decoder_with_retriever, + ): + retriever_output, layernorm_input, layernorm_output = ( self.retro_decoder_cross_attention( retriever_input, retriever_output, @@ -1286,20 +1547,29 @@ def forward(self, hidden_states, attention_mask=None, layernorm_input, layernorm_output, inference_params, - bias_dropout_add_func) + bias_dropout_add_func, + ) + ) else: - raise Exception("Unsupported layer type, '%s'." % - self.layer_type.name) + raise Exception("Unsupported layer type, '%s'." % self.layer_type.name) # MLP. 
- moe_loss = torch.tensor(0.0, device=layernorm_output.device, dtype=layernorm_output.dtype) - mlp_bias = torch.tensor(0.0, device=layernorm_output.device, dtype=layernorm_output.dtype) + moe_loss = torch.tensor( + 0.0, device=layernorm_output.device, dtype=layernorm_output.dtype + ) + mlp_bias = torch.tensor( + 0.0, device=layernorm_output.device, dtype=layernorm_output.dtype + ) if self.num_experts == 1: mlp_output, mlp_bias = self.mlp(layernorm_output) else: mlp_output, moe_loss, _ = self.mlp(layernorm_output) + # when aggregated_moe_loss received, returned moe_loss is the aggregated moe loss + if aggregated_moe_loss is not None: + moe_loss += aggregated_moe_loss + # Second residual connection. if self.apply_residual_connection_post_layernorm: residual = layernorm_output @@ -1311,10 +1581,8 @@ def forward(self, hidden_states, attention_mask=None, mlp_bias = mlp_bias.expand_as(residual) with self.bias_dropout_add_exec_handler(): output = bias_dropout_add_func( - mlp_output, - mlp_bias, - residual, - self.hidden_dropout) + mlp_output, mlp_bias, residual, self.hidden_dropout + ) # Jit compiled function creates 'view' tensor. This tensor # potentially gets saved in the MPU checkpoint function context, @@ -1322,16 +1590,16 @@ def forward(self, hidden_states, attention_mask=None, # won't result in memory savings (like the data loader, or # p2p_communication), it serves to document the origin of this # 'view' tensor. 
- output = core.utils.make_viewless_tensor(inp = output, - requires_grad = output.requires_grad, - keep_graph = True) + output = core.utils.make_viewless_tensor( + inp=output, requires_grad=output.requires_grad, keep_graph=True + ) else: if mlp_bias is not None: mlp_output = mlp_output + mlp_bias - out = torch.nn.functional.dropout(mlp_output, - p=self.hidden_dropout, - training=self.training) + out = torch.nn.functional.dropout( + mlp_output, p=self.hidden_dropout, training=self.training + ) output = residual + self.drop_path(out) if self.layer_type == LayerType.retro_decoder_with_retriever: @@ -1360,25 +1628,81 @@ class ParallelTransformerLayerPipe(ParallelTransformerLayer): If no mask is provided, the module will query `self._args.attn_mask` for the mask and only return `super().forward(...)` """ + + def __init__( + self, + config, + layer_number, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + drop_path_rate=0.0, + num_experts=1, + input_aggregated_moe_loss=False, + return_aggregated_moe_loss=False, + ): + self.input_aggregated_moe_loss = input_aggregated_moe_loss + self.return_aggregated_moe_loss = return_aggregated_moe_loss + super().__init__( + config, + layer_number, + layer_type, + self_attn_mask_type, + drop_path_rate, + num_experts, + ) + def forward(self, inputs, **kwargs): assert torch.is_tensor(inputs) or isinstance(inputs, tuple) - if not hasattr(self, '_args'): + if not hasattr(self, "_args"): self._args = get_args() - rotary_pos_emb = self._args.rotary_pos_emb if self._args.use_rotary_position_embeddings else None + rotary_pos_emb = ( + self._args.rotary_pos_emb + if self._args.use_rotary_position_embeddings + else None + ) if torch.is_tensor(inputs) or len(inputs) == 1: + assert ( + not self.input_aggregated_moe_loss + ), f"Expecting an input tuple of size >= 2" # No attention mask forwarded, search for args.attn_mask hidden_states, attention_mask = inputs, self._args.attn_mask - # HACK: currently MoE model does not 
support pipeline parallel, so - # here we just ignore the moe_loss returned by forward() - return super().forward(hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb)[0] - elif len(inputs) == 2: - # Attention mask is an activation. - hidden_states, attention_mask = inputs[0], inputs[1] - # HACK: currently MoE model does not support pipeline parallel, so - # here we just ignore the moe_loss returned by forward() - return super().forward(*inputs, **kwargs, rotary_pos_emb=rotary_pos_emb)[0], attention_mask + output, moe_loss = super().forward( + hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb + ) + return (output, moe_loss) if self.return_aggregated_moe_loss else output + elif len(inputs) in (2, 3): + # Attention mask and aggregated_moe can both be activations. + return_attention_mask = False + if len(inputs) == 2: + if self.input_aggregated_moe_loss: + hidden_states, aggregated_moe_loss = inputs[0], inputs[1] + attention_mask = self._args.attn_mask + else: + hidden_states, attention_mask = inputs[0], inputs[1] + return_attention_mask = True + else: + hidden_states, attention_mask, aggregated_moe_loss = ( + inputs[0], + inputs[1], + inputs[2], + ) + + # Forward aggregated_moe_loss to ParallelTransformerLayer for further accumulation + if self.input_aggregated_moe_loss: + kwargs.update({"aggregated_moe_loss": aggregated_moe_loss}) + + output, moe_loss = super().forward( + hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb + ) + + ret = (output,) + if return_attention_mask: + ret += (attention_mask,) + if self.return_aggregated_moe_loss: + ret += (moe_loss,) + return ret else: - raise RuntimeError('Received more inputs than understood.') + raise RuntimeError("Received more inputs than understood.") class NoopTransformerLayer(MegatronModule): @@ -1401,15 +1725,20 @@ def __init__(self, layer_number): super().__init__() self.layer_number = layer_number - def forward(self, hidden_states, attention_mask, - 
encoder_output=None, enc_dec_attn_mask=None, - inference_params=None): + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None, + ): return hidden_states.clone() def _get_num_layers(args, model_type, is_decoder=False): """Compute the number of transformer layers resident on the current rank.""" - is_encoder_and_decoder_model = (model_type == ModelType.encoder_and_decoder) + is_encoder_and_decoder_model = model_type == ModelType.encoder_and_decoder if model_type == ModelType.retro_encoder: num_layers = args.retro_encoder_layers elif parallel_state.get_pipeline_model_parallel_world_size() > 1: @@ -1422,27 +1751,34 @@ def _get_num_layers(args, model_type, is_decoder=False): # the same whether or not a standalone embedding stage is used. num_ranks_in_encoder = ( args.pipeline_model_parallel_split_rank - 1 - if args.standalone_embedding_stage else - args.pipeline_model_parallel_split_rank + if args.standalone_embedding_stage + else args.pipeline_model_parallel_split_rank + ) + num_ranks_in_decoder = ( + args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder + ) + assert args.encoder_num_layers % num_ranks_in_encoder == 0, ( + "encoder_num_layers (%d) must be divisible by number of ranks given to encoder (%d)" + % (args.encoder_num_layers, num_ranks_in_encoder) + ) + assert args.decoder_num_layers % num_ranks_in_decoder == 0, ( + "decoder_num_layers (%d) must be divisible by number of ranks given to decoder (%d)" + % (args.decoder_num_layers, num_ranks_in_decoder) ) - num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder - assert args.encoder_num_layers % num_ranks_in_encoder == 0, \ - 'encoder_num_layers (%d) must be divisible by number of ranks given to encoder (%d)' % (args.encoder_num_layers, num_ranks_in_encoder) - assert args.decoder_num_layers % num_ranks_in_decoder == 0, \ - 'decoder_num_layers (%d) must be divisible by number of ranks 
given to decoder (%d)' % (args.decoder_num_layers, num_ranks_in_decoder) if parallel_state.is_pipeline_stage_before_split(): num_layers = ( 0 if args.standalone_embedding_stage - and parallel_state.get_pipeline_model_parallel_rank() == 0 else - args.encoder_num_layers // num_ranks_in_encoder + and parallel_state.get_pipeline_model_parallel_rank() == 0 + else args.encoder_num_layers // num_ranks_in_encoder ) else: num_layers = args.decoder_num_layers // num_ranks_in_decoder else: assert args.num_layers == args.encoder_num_layers - assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \ - 'num_layers must be divisible by transformer_pipeline_model_parallel_size' + assert ( + args.num_layers % args.transformer_pipeline_model_parallel_size == 0 + ), "num_layers must be divisible by transformer_pipeline_model_parallel_size" # When a standalone embedding stage is used, all transformer layers # are divided among pipeline rank >= 1, while on pipeline rank 0, @@ -1451,8 +1787,8 @@ def _get_num_layers(args, model_type, is_decoder=False): num_layers = ( 0 if args.standalone_embedding_stage - and parallel_state.get_pipeline_model_parallel_rank() == 0 else - args.num_layers // args.transformer_pipeline_model_parallel_size + and parallel_state.get_pipeline_model_parallel_rank() == 0 + else args.num_layers // args.transformer_pipeline_model_parallel_size ) else: if not is_decoder: @@ -1462,14 +1798,15 @@ def _get_num_layers(args, model_type, is_decoder=False): return num_layers -def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, - layer_number): +def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, layer_number): args = get_args() if args.retro_add_retriever and layer_number in retro_layer_numbers: if model_type == ModelType.retro_decoder: - return LayerType.retro_decoder_with_retriever \ - if layer_number == retro_layer_numbers[0] \ - else LayerType.retro_decoder + return ( + LayerType.retro_decoder_with_retriever 
+ if layer_number == retro_layer_numbers[0] + else LayerType.retro_decoder + ) elif model_type == ModelType.retro_encoder: return LayerType.retro_encoder else: @@ -1478,17 +1815,41 @@ def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, return default_layer_type +def get_num_experts_per_layer( + num_experts: list, num_layers: int, expert_interval: int, offset: int = 0 +) -> list: + assert ( + len(num_experts) == 1 or len(num_experts) == num_layers // expert_interval + ), "num_experts must be either a single value or a list of the same length as the number of MoE layers" + if len(num_experts) == 1: + num_experts = num_experts * (num_layers // expert_interval) + experts_per_layer = [] + for i in range(num_layers): + layer_num = i + 1 + offset + n_e = ( + num_experts[(layer_num - 1) // expert_interval] + if layer_num % expert_interval == 0 + else 1 + ) + experts_per_layer.append(n_e) + return experts_per_layer + + class ParallelTransformer(MegatronModule): """Transformer class.""" - def __init__(self, config, - model_type, layer_type=LayerType.encoder, - self_attn_mask_type=AttnMaskType.padding, - post_layer_norm=True, - pre_process=True, - post_process=True, - drop_path_rate=0.0, - num_experts=[1]): + def __init__( + self, + config, + model_type, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, + post_process=True, + drop_path_rate=0.0, + num_experts=[1], + ): super(ParallelTransformer, self).__init__() args = get_args() @@ -1511,14 +1872,15 @@ def __init__(self, config, self.recompute_granularity = config.recompute_granularity self.recompute_method = config.recompute_method self.recompute_num_layers = config.recompute_num_layers - self.distribute_saved_activations = \ + self.distribute_saved_activations = ( config.distribute_saved_activations and not config.sequence_parallel + ) self.sequence_parallel = config.sequence_parallel # Transformer Engine Init. 
self.transformer_engine_rope_available = False - if self.transformer_impl == 'transformer_engine': + if self.transformer_impl == "transformer_engine": global transformer_engine import transformer_engine from importlib.metadata import version @@ -1550,45 +1912,53 @@ def __init__(self, config, self.num_microbatches_in_previous_step = -1 self.microbatch_count = 0 - self.checkpoint_core_attention = config.recompute_granularity == 'selective' + self.checkpoint_core_attention = config.recompute_granularity == "selective" # Number of layers. - self.num_layers = _get_num_layers(args, model_type, - layer_type==LayerType.decoder) + self.num_layers = _get_num_layers( + args, model_type, layer_type == LayerType.decoder + ) self.drop_path_rates = [ - rate.item() for rate in - torch.linspace(0, self.drop_path_rate, config.num_layers)] + rate.item() + for rate in torch.linspace(0, self.drop_path_rate, config.num_layers) + ] self.retro_layer_numbers = None if model_type == ModelType.retro_decoder: retro_layer_start = 6 if config.num_layers <= 15 else 9 - self.retro_layer_numbers = \ - np.arange(retro_layer_start, args.num_layers + 1, 3).tolist() + self.retro_layer_numbers = np.arange( + retro_layer_start, args.num_layers + 1, 3 + ).tolist() if model_type == ModelType.retro_encoder: self.retro_layer_numbers = [1] # Transformer layers. if args.retro_add_retriever: - assert self.recompute_granularity != 'full', \ - "Full recompute not supported for Retro." - assert args.transformer_impl == 'local', \ - "Transformer engine does not support Retro layers." + assert ( + self.recompute_granularity != "full" + ), "Full recompute not supported for Retro." + assert ( + args.transformer_impl == "local" + ), "Transformer engine does not support Retro layers." 
+ def build_layer(layer_number, n_e): - if args.transformer_impl == 'local': + if args.transformer_impl == "local": current_layer_type = _get_layer_type( - model_type, layer_type, self.retro_layer_numbers, - layer_number) + model_type, layer_type, self.retro_layer_numbers, layer_number + ) return ParallelTransformerLayer( config, layer_number, layer_type=current_layer_type, self_attn_mask_type=self_attn_mask_type, drop_path_rate=self.drop_path_rates[layer_number - 1], - num_experts=n_e) + num_experts=n_e, + ) else: - assert config.num_attention_heads == config.num_key_value_heads, \ - 'Transformer_engine does not support GQA' + assert ( + config.num_attention_heads == config.num_key_value_heads + ), "Transformer_engine does not support GQA" return transformer_engine.pytorch.TransformerLayer( config.hidden_size, config.ffn_hidden_size, @@ -1615,16 +1985,22 @@ def build_layer(layer_number, n_e): layer_type="encoder", drop_path_rate=self.drop_path_rates[layer_number - 1], set_parallel_mode=True, - fuse_qkv_params=True) + fuse_qkv_params=True, + ) if config.virtual_pipeline_model_parallel_size is not None: - assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \ - 'num_layers_per_stage must be divisible by ' \ - 'virtual_pipeline_model_parallel_size' + assert ( + config.num_layers % config.virtual_pipeline_model_parallel_size == 0 + ), ( + "num_layers_per_stage must be divisible by " + "virtual_pipeline_model_parallel_size" + ) assert args.model_type != ModelType.encoder_and_decoder # Number of layers in each model chunk is the number of layers in the stage, # divided by the number of model chunks in a stage. 
- self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size + self.num_layers = ( + self.num_layers // config.virtual_pipeline_model_parallel_size + ) # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of # layers to stages like (each list is a model chunk): # Stage 0: [0] [2] [4] [6] @@ -1634,12 +2010,14 @@ def build_layer(layer_number, n_e): # Stage 0: [0, 1] [4, 5] # Stage 1: [2, 3] [6, 7] offset = parallel_state.get_virtual_pipeline_model_parallel_rank() * ( - config.num_layers // config.virtual_pipeline_model_parallel_size) + \ - (parallel_state.get_pipeline_model_parallel_rank() * self.num_layers) + config.num_layers // config.virtual_pipeline_model_parallel_size + ) + (parallel_state.get_pipeline_model_parallel_rank() * self.num_layers) else: # Each stage gets a contiguous set of layers. - if args.model_type == ModelType.encoder_and_decoder and \ - parallel_state.get_pipeline_model_parallel_world_size() > 1: + if ( + args.model_type == ModelType.encoder_and_decoder + and parallel_state.get_pipeline_model_parallel_world_size() > 1 + ): pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() if layer_type == LayerType.encoder: offset = pipeline_rank * self.num_layers @@ -1647,7 +2025,9 @@ def build_layer(layer_number, n_e): num_ranks_in_enc = args.pipeline_model_parallel_split_rank offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers else: - offset = parallel_state.get_pipeline_model_parallel_rank() * self.num_layers + offset = ( + parallel_state.get_pipeline_model_parallel_rank() * self.num_layers + ) if self.num_layers == 0: # When a standalone embedding stage is used (e.g., @@ -1659,23 +2039,16 @@ def build_layer(layer_number, n_e): # this, we assign a 'no-op' layer on these ranks, which will # disconnect the input tensor from the output tensor. 
self.num_layers = 1 - self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ]) + self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) else: - assert len(num_experts) == 1 or len(num_experts) == args.num_layers // args.expert_interval, \ - 'num_experts must be either a single value or a list of the same length as the number of MoE layers' - - # Create the list of MoE experts - if len(num_experts) == 1: - num_experts = num_experts * (args.num_layers // args.expert_interval) - # Build the layers self.layers = [] + experts_per_layer = get_num_experts_per_layer( + num_experts, self.num_layers, args.expert_interval, offset + ) for i in range(self.num_layers): layer_num = i + 1 + offset - if layer_num % args.expert_interval == 0: - n_e = num_experts[(layer_num-1) // args.expert_interval] - else: - n_e = 1 + n_e = experts_per_layer[i] self.layers.append(build_layer(layer_num, n_e)) self.layers = torch.nn.ModuleList(self.layers) @@ -1683,40 +2056,54 @@ def build_layer(layer_number, n_e): if model_type == ModelType.retro_encoder: for layer in self.layers: if layer.self_attention.use_flash_attn: - layer.self_attention.core_attention_flash.dropout_p = \ + layer.self_attention.core_attention_flash.dropout_p = ( torch.nn.Dropout(args.retro_encoder_attention_dropout) + ) else: - layer.self_attention.core_attention.attention_dropout.p =\ + layer.self_attention.core_attention.attention_dropout.p = ( args.retro_encoder_attention_dropout + ) layer.hidden_dropout = args.retro_encoder_hidden_dropout if self.post_process and self.post_layer_norm: # Final layer norm before output. 
- if args.normalization == 'layernorm': - if get_accelerator().device_name() == 'cuda': + if args.normalization == "layernorm": + if get_accelerator().device_name() == "cuda": self.final_layernorm = LayerNorm( config.hidden_size, eps=config.layernorm_epsilon, no_persist_layer_norm=args.no_persist_layer_norm, sequence_parallel=config.sequence_parallel, apply_layernorm_1p=args.apply_layernorm_1p, - mem_efficient_ln=args.mem_efficient_ln) + mem_efficient_ln=args.mem_efficient_ln, + ) else: self.final_layernorm = LayerNorm( - config.hidden_size, - eps=config.layernorm_epsilon) + config.hidden_size, eps=config.layernorm_epsilon + ) else: - self.final_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon) + self.final_layernorm = RMSNorm( + config.hidden_size, + config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel, + ) def _get_layer(self, layer_number): return self.layers[layer_number] - def _checkpointed_forward(self, hidden_states, attention_mask, - encoder_output, enc_dec_attn_mask, - rotary_pos_emb, is_first_microbatch): + def _checkpointed_forward( + self, + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + rotary_pos_emb, + is_first_microbatch, + ): args = get_args() """Forward method with activation checkpointing.""" + def custom(start, end): def custom_forward(*args, **kwargs): x_, *args = args @@ -1728,11 +2115,14 @@ def custom_forward(*args, **kwargs): x_, moe_loss = output else: x_ = output - moe_loss = torch.tensor(0.0, device=x_.device, dtype=x_.dtype, requires_grad=True) + moe_loss = torch.tensor( + 0.0, device=x_.device, dtype=x_.dtype, requires_grad=True + ) moe_losses.append(moe_loss) return (x_, *moe_losses) + return custom_forward - + if args.deepspeed and args.deepspeed_activation_checkpointing: moe_losses = [] # Make sure memory is freed. 
@@ -1740,9 +2130,18 @@ def custom_forward(*args, **kwargs): l = 0 while l < self.num_layers: hidden_states, *local_moe_losses = tensor_parallel.checkpoint( - custom(l, l + self.checkpoint_num_layers), False, - hidden_states, attention_mask, encoder_output, enc_dec_attn_mask, - None, None, None, None, rotary_pos_emb) + custom(l, l + self.checkpoint_num_layers), + False, + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + None, + None, + None, + None, + rotary_pos_emb, + ) moe_losses.extend(local_moe_losses) l += self.checkpoint_num_layers @@ -1750,66 +2149,105 @@ def custom_forward(*args, **kwargs): else: moe_losses = [] te_forward_kwargs = {} - if self.transformer_impl == 'transformer_engine': - te_forward_kwargs['is_first_microbatch'] = is_first_microbatch + if self.transformer_impl == "transformer_engine": + te_forward_kwargs["is_first_microbatch"] = is_first_microbatch if self.transformer_engine_rope_available: - te_forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + te_forward_kwargs["rotary_pos_emb"] = rotary_pos_emb - if self.recompute_method == 'uniform': + if self.recompute_method == "uniform": # Uniformly divide the total number of Transformer layers and # checkpoint the input activation of each divided chunk. # A method to further reduce memory usage reducing checkpoints. 
l = 0 while l < self.num_layers: - if self.transformer_impl == 'transformer_engine': - hidden_states, *local_moe_losses = transformer_engine.pytorch.distributed.checkpoint( - custom(l, l + self.recompute_num_layers), - self.distribute_saved_activations, - tensor_parallel.get_cuda_rng_tracker, - mpu.get_tensor_model_parallel_group(), - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, **te_forward_kwargs) + if self.transformer_impl == "transformer_engine": + hidden_states, *local_moe_losses = ( + transformer_engine.pytorch.distributed.checkpoint( + custom(l, l + self.recompute_num_layers), + self.distribute_saved_activations, + tensor_parallel.get_cuda_rng_tracker, + mpu.get_tensor_model_parallel_group(), + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + **te_forward_kwargs, + ) + ) else: hidden_states, *local_moe_losses = tensor_parallel.checkpoint( custom(l, l + self.recompute_num_layers), self.distribute_saved_activations, - hidden_states, attention_mask, - encoder_output, enc_dec_attn_mask, - None, None, None, None, rotary_pos_emb) + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + None, + None, + None, + None, + rotary_pos_emb, + ) moe_losses.extend(local_moe_losses) l += self.recompute_num_layers - elif self.recompute_method == 'block': + elif self.recompute_method == "block": # Checkpoint the input activation of only a set number of individual # Transformer layers and skip the rest. # A method fully use the device memory removing redundant re-computation. 
for l in range(self.num_layers): if l < self.recompute_num_layers: - if self.transformer_impl == 'transformer_engine': - hidden_states, *local_moe_losses = transformer_engine.pytorch.distributed.checkpoint( - custom(l, l + 1), - self.distribute_saved_activations, - tensor_parallel.get_cuda_rng_tracker, - mpu.get_tensor_model_parallel_group(), - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, **te_forward_kwargs) + if self.transformer_impl == "transformer_engine": + hidden_states, *local_moe_losses = ( + transformer_engine.pytorch.distributed.checkpoint( + custom(l, l + 1), + self.distribute_saved_activations, + tensor_parallel.get_cuda_rng_tracker, + mpu.get_tensor_model_parallel_group(), + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + **te_forward_kwargs, + ) + ) else: - hidden_states, *local_moe_losses = tensor_parallel.checkpoint( - custom(l, l + 1), - self.distribute_saved_activations, - hidden_states, attention_mask, - encoder_output, enc_dec_attn_mask, - None, None, None, None, rotary_pos_emb) + hidden_states, *local_moe_losses = ( + tensor_parallel.checkpoint( + custom(l, l + 1), + self.distribute_saved_activations, + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + None, + None, + None, + None, + rotary_pos_emb, + ) + ) else: - if self.transformer_impl == 'transformer_engine': + if self.transformer_impl == "transformer_engine": hidden_states, *local_moe_losses = custom(l, l + 1)( - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, **te_forward_kwargs) + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + **te_forward_kwargs, + ) else: hidden_states, *local_moe_losses = custom(l, l + 1)( - hidden_states, attention_mask, - encoder_output, enc_dec_attn_mask, - None, None, None, None, rotary_pos_emb) - + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + None, + None, + None, + None, + rotary_pos_emb, + ) + 
moe_losses.extend(local_moe_losses) else: raise ValueError("Invalid activation recompute method.") @@ -1825,19 +2263,25 @@ def set_input_tensor(self, input_tensor): forward_step_func""" self.input_tensor = input_tensor - def forward(self, hidden_states, attention_mask, - encoder_output=None, enc_dec_attn_mask=None, - retriever_input=None, - retriever_output=None, - retriever_attn_mask=None, - inference_params=None, - rotary_pos_emb=None): + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + retriever_input=None, + retriever_output=None, + retriever_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + ): # hidden_states: [s, b, h] # Checks. if inference_params: - assert self.recompute_granularity is None, \ - 'inference does not work with activation checkpointing' + assert ( + self.recompute_granularity is None + ), "inference does not work with activation checkpointing" # TODO: Below old DeepSpeed code are commented because it's unsure whether # it is still relevant. 
@@ -1892,64 +2336,77 @@ def forward(self, hidden_states, attention_mask, with rng_context: # The fp8_autocast context manager is a no-op when enabled=True # The if...else serves to short circuit name resolution for fp8_autocast - with transformer_engine.pytorch.fp8_autocast( - enabled=self.use_fp8, - fp8_recipe=self.fp8_recipe, - fp8_group=self.fp8_group - ) if self.use_fp8 else nullcontext(): + with ( + transformer_engine.pytorch.fp8_autocast( + enabled=self.use_fp8, + fp8_recipe=self.fp8_recipe, + fp8_group=self.fp8_group, + ) + if self.use_fp8 + else nullcontext() + ): # Determine if the current iteration is first microbatch if self.num_microbatches_in_previous_step != get_num_microbatches(): - self.microbatch_count = 0 # Reset count on new batch size rampup interval + self.microbatch_count = ( + 0 # Reset count on new batch size rampup interval + ) self.num_microbatches_in_previous_step = get_num_microbatches() - is_first_microbatch = self.microbatch_count % get_num_microbatches() == 0 + is_first_microbatch = ( + self.microbatch_count % get_num_microbatches() == 0 + ) # Forward pass. 
moe_losses = [] if self.checkpoint_activations: - hidden_states, moe_losses = self._checkpointed_forward(hidden_states, - attention_mask, - encoder_output, - enc_dec_attn_mask, - rotary_pos_emb, - is_first_microbatch) - elif self.recompute_granularity == 'full': - hidden_states, moe_losses = self._checkpointed_forward(hidden_states, - attention_mask, - encoder_output, - enc_dec_attn_mask, - rotary_pos_emb, - is_first_microbatch) + hidden_states, moe_losses = self._checkpointed_forward( + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + rotary_pos_emb, + is_first_microbatch, + ) + elif self.recompute_granularity == "full": + hidden_states, moe_losses = self._checkpointed_forward( + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + rotary_pos_emb, + is_first_microbatch, + ) else: forward_kwargs = { - 'encoder_output': encoder_output, - 'enc_dec_attn_mask': enc_dec_attn_mask, - 'inference_params': inference_params, + "encoder_output": encoder_output, + "enc_dec_attn_mask": enc_dec_attn_mask, + "inference_params": inference_params, } - if self.transformer_impl == 'transformer_engine': - forward_kwargs['is_first_microbatch'] = is_first_microbatch - forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention + if self.transformer_impl == "transformer_engine": + forward_kwargs["is_first_microbatch"] = is_first_microbatch + forward_kwargs["checkpoint_core_attention"] = ( + self.checkpoint_core_attention + ) if self.transformer_engine_rope_available: - forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + forward_kwargs["rotary_pos_emb"] = rotary_pos_emb else: - forward_kwargs['rotary_pos_emb'] = rotary_pos_emb - forward_kwargs['retriever_input'] = retriever_input - forward_kwargs['retriever_output'] = retriever_output - forward_kwargs['retriever_attn_mask'] = retriever_attn_mask + forward_kwargs["rotary_pos_emb"] = rotary_pos_emb + forward_kwargs["retriever_input"] = retriever_input + 
forward_kwargs["retriever_output"] = retriever_output + forward_kwargs["retriever_attn_mask"] = retriever_attn_mask for index in range(self.num_layers): layer = self._get_layer(index) hidden_states = layer( - hidden_states, - attention_mask, - **forward_kwargs) + hidden_states, attention_mask, **forward_kwargs + ) # First Retro decoder layer returns both hidden_states # and retriever_output. Make retriever_output available # to subsequence Retro layers. if isinstance(hidden_states, tuple): - assert (len(hidden_states) == 2 or len(hidden_states) == 3) + assert len(hidden_states) == 2 or len(hidden_states) == 3 if len(hidden_states) == 2: if not self.ds_inference: hidden_states, moe_loss = hidden_states @@ -1975,6 +2432,7 @@ def forward(self, hidden_states, attention_mask, return (hidden_states, *moe_losses) + class LMHeadPipe(MegatronModule): """ Arguments: @@ -1988,11 +2446,13 @@ class LMHeadPipe(MegatronModule): def __init__(self, hidden_size, vocab_size, config): args = get_args() super(LMHeadPipe, self).__init__() - self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=hidden_size, - output_size=vocab_size, - bias=False, - config=config, - init_method=config.init_method,) + self.lm_head = tensor_parallel.ColumnParallelLinear( + input_size=hidden_size, + output_size=vocab_size, + bias=False, + config=config, + init_method=config.init_method, + ) def forward(self, inputs, **kwargs): assert torch.is_tensor(inputs) or isinstance(inputs, tuple) @@ -2001,10 +2461,10 @@ def forward(self, inputs, **kwargs): else: hidden_states = inputs - if not hasattr(self, '_args'): + if not hasattr(self, "_args"): self._args = get_args() - if hasattr(self._args, 'attn_mask'): + if hasattr(self._args, "attn_mask"): attention_mask = None else: attention_mask = inputs[1] @@ -2012,7 +2472,7 @@ def forward(self, inputs, **kwargs): logits, _ = self.lm_head(hidden_states) # If cmd args has attn_mask, we don't forward it as an activation. 
- if hasattr(self._args, 'attn_mask'): + if hasattr(self._args, "attn_mask"): return logits else: return logits, attention_mask diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 12a458375d..99145ff4f4 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -1,13 +1,9 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': - from apex.optimizers import FusedAdam as Adam - from apex.optimizers import FusedSGD as SGD -else: - from torch.optim import Adam - from torch.optim import SGD +import torch +from typing import Callable, Any, Iterable, Union from megatron import get_args from .distrib_optimizer import DistributedOptimizer @@ -15,19 +11,60 @@ from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer -def get_param_groups(modules, - no_weight_decay_cond, - scale_lr_cond, - lr_mult): - """creates param groups based on weight decay condition (regularized vs non regularized) - and learning rate scale condition (args.lr vs lr_mult * args.lr) - scale_lr_cond is used during finetuning where head of the network requires a scaled - version of the base learning rate. +import ezpz as ez +RANK = ez.get_rank() + + +def get_param_groups( + modules: Union[torch.nn.Module, Iterable[torch.nn.Module]], + no_weight_decay_cond: Callable[[str, torch.Tensor], bool], + scale_lr_cond: Callable[[str, torch.Tensor], bool], + lr_mult: Any, + use_galore: bool = False, +): + """ + Creates param groups (regularized vs non) based on: + + - weight decay condition. + - learning rate scale condition (args.lr vs lr_mult * args.lr) + - scale_lr_cond is used during finetuning, where head of the network + requires a scaled version of the base learning rate. 
+ # if 'galore' in args.optimizer.lower(): + # # make parameters with "rank" to a single group, if param_name has "mlp" or "attn" + # galore_params = [] + # target_modules_list = ["attn", "mlp"] + # # for module_name, module in param_groups: + # for group_id, group in enumerate(param_groups): + # for param, p in enumerate(group['params']): + # if not isinstance(module, torch.nn.Linear): + # continue + # if not any(target_key in module_name for target_key in target_modules_list): + # continue + # print('enable GaLore for weights in module: ', module_name) + # galore_params.append(module.weight) + # id_galore_params = [id(p) for p in galore_params] + # # make parameters without "rank" to another group + # regular_params = [p for p in param_groups if id(p) not in id_galore_params] + # # then call galore_adamw + # param_groups = [ + # { + # 'params': regular_params + # }, + # { + # 'params': galore_params, + # 'rank': RANK, + # 'update_proj_gap': args.update_proj_gap, + # 'scale': args.galore_scale, + # 'proj_type': args.proj_type + # } + # ] """ wd_no_scale_lr = [] wd_scale_lr = [] no_wd_no_scale_lr = [] no_wd_scale_lr = [] + galore_params = [] + target_modules_list = ["attn", "mlp"] for module in modules: for name, param in module.named_parameters(): if not param.requires_grad: @@ -65,20 +102,30 @@ def get_param_groups(modules, return param_groups -def get_megatron_optimizer(model, - no_weight_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0): + +def get_megatron_optimizer( + model, + no_weight_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0 +): args = get_args() + assert args is not None # Base optimizer. 
- param_groups = get_param_groups(model, - no_weight_decay_cond, - scale_lr_cond, - lr_mult) + param_groups = get_param_groups( + model, + no_weight_decay_cond, + scale_lr_cond, + lr_mult + ) if args.create_moe_param_group: - from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer - param_groups = split_params_into_different_moe_groups_for_optimizer(param_groups) + from deepspeed.moe.utils import ( + split_params_into_different_moe_groups_for_optimizer + ) + param_groups = split_params_into_different_moe_groups_for_optimizer( + param_groups + ) if args.cpu_optimizer: assert args.optimizer == 'adam', 'CPU offloading is for Adam' @@ -87,45 +134,209 @@ def get_megatron_optimizer(model, else: from deepspeed.ops.adam import DeepSpeedCPUAdam cpu_adam_optimizer = DeepSpeedCPUAdam - optimizer = cpu_adam_optimizer(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) - else: - if args.optimizer == 'adam': - if args.ds_fused_adam: - global Adam - from deepspeed.ops.adam import FusedAdam - Adam = FusedAdam - optimizer = Adam(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) - elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) - else: - raise Exception('{} optimizer is not supported.'.format( - args.optimizer)) + optimizer = cpu_adam_optimizer( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif args.optimizer.lower() == "galore_adamw": + from galore_torch import GaLoreAdamW, GaLoreAdamW8bit + # redefine way to call galore_adamw + optimizer = GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == "galore_adamw": + # redefine way to call galore_adamw + optimizer = 
GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay) + # implement adafactor + elif args.optimizer.lower() == "adafactor": + import transformers + args.beta1 = None if args.beta1 == 0.0 else args.beta1 + optimizer = transformers.optimization.Adafactor( + param_groups, + lr=args.lr, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=args.beta1, + weight_decay=args.weight_decay, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + # low-rank adafactor + elif args.optimizer.lower() == "galore_adafactor": + args.beta1 = None if args.beta1 == 0.0 else args.beta1 + optimizer = GaLoreAdafactor( + param_groups, + lr=args.lr, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=args.beta1, + weight_decay=args.weight_decay, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + # 8-bit Adam + elif args.optimizer.lower() == "adam8bit": + import bitsandbytes as bnb + optimizer = bnb.optim.Adam8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == "galore_adamw8bit": + optimizer = GaLoreAdamW8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == 'galore_adamw8bit_per_layer': + # TODO: seems scheduler call twice in one update step, need to check, for now double the num_training_steps, warmup_steps and update_proj_gap + optimizer_dict = {} + for p in model.parameters(): + if p.requires_grad: + if id(p) in id_galore_params: + optimizer_dict[p] = GaLoreAdamW8bit([{'params': [p], 'rank': args.rank, 'update_proj_gap': args.update_proj_gap * 2, 'scale': args.galore_scale, 'proj_type': args.proj_type}], lr=args.lr, weight_decay=args.weight_decay) + else: + optimizer_dict[p] = bnb.optim.Adam8bit([p], lr=args.lr, weight_decay=args.weight_decay) + # get scheduler dict + scheduler_dict = {} + from galore_torch.peft_pretraining import training_utils + for p in model.parameters(): + if p.requires_grad: + 
scheduler_dict[p] = training_utils.get_scheculer( + optimizer=optimizer_dict[p], + scheduler_type=args.scheduler, + num_training_steps=args.num_training_steps * 2, + warmup_steps=args.warmup_steps * 2, + min_lr_ratio=args.min_lr_ratio, + ) + + def optimizer_hook(p): + if p.grad is None: + return + optimizer_dict[p].step() + optimizer_dict[p].zero_grad() + scheduler_dict[p].step() + # Register the hook onto every parameter + for p in model.parameters(): + if p.requires_grad: + p.register_post_accumulate_grad_hook(optimizer_hook) + layer_wise_flag = True + elif str(args.optimizer) == 'ipex.lamb': + from intel_extension_for_pytorch.optim._lamb import Lamb + optimizer = Lamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif str(args.optimizer) == 'ipex.fusedlamb': + from intel_extension_for_pytorch.optim._lamb import Lamb + optimizer = Lamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + fused=True, + ) + elif str(args.optimizer).lower() == 'ds.fusedlamb': + from deepspeed.ops.lamb import FusedLamb + optimizer = FusedLamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif str(args.optimizer).lower() == 'adamwschedulefree': + import schedulefree + optimizer = schedulefree.AdamWScheduleFree( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + warmup_steps=args.lr_warmup_iters, + foreach=args.schedulefree_for_each, + ) + elif str(args.optimizer).lower() == 'sgdschedulefree': + import schedulefree + optimizer = schedulefree.SGDScheduleFree( + param_groups, + lr=args.lr, + momentum=args.sgd_momentum, + weight_decay=args.weight_decay, + warmup_steps=args.lr_warmup_iters, + foreach=args.schedulefree_for_each, + ) + elif 
str(args.optimizer).lower() == 'apex.adam': + assert get_accelerator().device_name() == 'cuda' + from apex.optimizers import FusedAdam as Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif str(args.optimizer).lower() == 'apex.sgd': + from apex.optimizers import FusedSGD as SGD + optimizer = SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + elif str(args.optimizer).lower() == 'adamw': + optimizer = torch.optim.AdamW( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'adam': + if args.ds_fused_adam: + # global Adam + from deepspeed.ops.adam import FusedAdam + Adam = FusedAdam + else: + Adam = torch.optim.Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'sgd': + optimizer = torch.optim.SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + elif str(args.optimizer).lower() == 'sophiag': + from .sophia import SophiaG + optimizer = SophiaG( + param_groups, + lr=args.lr, + betas=(args.sophiag_beta1, args.sophiag_beta2), + rho = args.sophiag_rho, + weight_decay=args.weight_decay + ) + else: + raise TypeError(f'{args.optimizer} optimizer is not supported.') if args.deepspeed: return optimizer - # Determine whether the params have main-grad field. params_have_main_grad = False if args.use_contiguous_buffers_in_local_ddp: params_have_main_grad = True - # Mixed precision optimizer. # - Note: both the Float16Optimizer and the DistributedOptimizer inherit # from the MixedPrecisionOptimizer, which manages any optimizer where # the model params and main params are distinct. 
if args.fp16 or args.bf16 or args.use_distributed_optimizer: - # Grad scaler: # if loss-scale is provided, instantiate the constant scaler. # if we are using fp16 and loss-scale is not present, use a @@ -133,11 +344,9 @@ def get_megatron_optimizer(model, # otherwise we are running in bf16 with no loss-scale so # leave it as None. grad_scaler = None - # Constant loss scale. if args.loss_scale: grad_scaler = ConstantGradScaler(args.loss_scale) - # Dynamic loss scale. else: if args.fp16: @@ -148,11 +357,11 @@ def get_megatron_optimizer(model, backoff_factor=0.5, growth_interval=args.loss_scale_window, hysteresis=args.hysteresis) - # Megatron optimizer. - opt_ty = DistributedOptimizer \ - if args.use_distributed_optimizer else \ - Float16OptimizerWithFloat16Params + opt_ty = ( + DistributedOptimizer if args.use_distributed_optimizer + else Float16OptimizerWithFloat16Params + ) return opt_ty(optimizer, args.clip_grad, args.log_num_zeros_in_grad, @@ -163,10 +372,12 @@ def get_megatron_optimizer(model, args.params_dtype, grad_scaler, model) - # FP32. 
- return FP32Optimizer(optimizer, args.clip_grad, - args.log_num_zeros_in_grad, - params_have_main_grad, - args.use_contiguous_buffers_in_local_ddp, - model) + return FP32Optimizer( + optimizer, + args.clip_grad, + args.log_num_zeros_in_grad, + params_have_main_grad, + args.use_contiguous_buffers_in_local_ddp, + model + ) diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index afec8f220c..b5141d0059 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -8,10 +8,14 @@ except ModuleNotFoundError: from torch import inf as inf -from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from apex.multi_tensor_apply import multi_tensor_applier import amp_C + HAS_APEX = True +except Exception: + HAS_APEX = False from megatron.model.module import param_is_not_shared from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate @@ -71,7 +75,7 @@ def clip_grad_norm_fp32(parameters, grads_for_norm, else: if norm_type == 2.0: - if get_accelerator().device_name() == 'cuda': + if get_accelerator().device_name() == 'cuda' and HAS_APEX: dummy_overflow_buf = torch.cuda.IntTensor([0]) # Use apex's multi-tensor applier for efficiency reasons. 
# Multi-tensor applier takes a function and a list of list diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py index 1aeeac3444..10331607d9 100644 --- a/megatron/optimizer/distrib_optimizer.py +++ b/megatron/optimizer/distrib_optimizer.py @@ -2,10 +2,11 @@ """Megatron distributed optimizer.""" -from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from apex.optimizers import FusedAdam as Adam -else: +except Exception: from torch.optim import Adam import math diff --git a/megatron/optimizer/sophia.py b/megatron/optimizer/sophia.py new file mode 100644 index 0000000000..4c4e074790 --- /dev/null +++ b/megatron/optimizer/sophia.py @@ -0,0 +1,202 @@ +import math +import torch +from torch import Tensor +from torch.optim.optimizer import Optimizer +from typing import List, Optional + + +#SOphiaG implementation from https://github.com/Liuhong99/Sophia/blob/main/sophia.py, copy pasted here because no pip and not sure about submodules + +class SophiaG(Optimizer): + def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho = 0.04, + weight_decay=1e-1, *, maximize: bool = False, + capturable: bool = False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= rho: + raise ValueError("Invalid rho parameter at index 1: {}".format(rho)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, rho=rho, + weight_decay=weight_decay, + maximize=maximize, capturable=capturable) + super(SophiaG, self).__init__(params, defaults) + + def __setstate__(self, 
state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('maximize', False) + group.setdefault('capturable', False) + state_values = list(self.state.values()) + step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step']) + if not step_is_tensor: + for s in state_values: + s['step'] = torch.tensor(float(s['step'])) + + @torch.no_grad() + def update_hessian(self): + for group in self.param_groups: + beta1, beta2 = group['betas'] + for p in group['params']: + if p.grad is None: + continue + state = self.state[p] + + if len(state) == 0: + state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \ + if self.defaults['capturable'] else torch.tensor(0.) + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if 'hessian' not in state.keys(): + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=1 - beta2) + + + @torch.no_grad() + def step(self, closure=None, bs=5120): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + state_steps = [] + hessian = [] + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + + if p.grad.is_sparse: + raise RuntimeError('Hero does not support sparse gradients') + grads.append(p.grad) + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \ + if self.defaults['capturable'] else torch.tensor(0.) 
+ state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if 'hessian' not in state.keys(): + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + state_steps.append(state['step']) + hessian.append(state['hessian']) + + if self.defaults['capturable']: + bs = torch.ones((1,), dtype=torch.float, device=p.device) * bs + + sophiag(params_with_grad, + grads, + exp_avgs, + hessian, + state_steps, + bs=bs, + beta1=beta1, + beta2=beta2, + rho=group['rho'], + lr=group['lr'], + weight_decay=group['weight_decay'], + maximize=group['maximize'], + capturable=group['capturable']) + + return loss + +def sophiag(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + hessian: List[Tensor], + state_steps: List[Tensor], + capturable: bool = False, + *, + bs: int, + beta1: float, + beta2: float, + rho: float, + lr: float, + weight_decay: float, + maximize: bool): + + if not all(isinstance(t, torch.Tensor) for t in state_steps): + raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors") + + + func = _single_tensor_sophiag + + func(params, + grads, + exp_avgs, + hessian, + state_steps, + bs=bs, + beta1=beta1, + beta2=beta2, + rho=rho, + lr=lr, + weight_decay=weight_decay, + maximize=maximize, + capturable=capturable) + +def _single_tensor_sophiag(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + hessian: List[Tensor], + state_steps: List[Tensor], + *, + bs: int, + beta1: float, + beta2: float, + rho: float, + lr: float, + weight_decay: float, + maximize: bool, + capturable: bool): + + for i, param in enumerate(params): + grad = grads[i] if not maximize else -grads[i] + exp_avg = exp_avgs[i] + hess = hessian[i] + step_t = state_steps[i] + + if capturable: + assert param.is_cuda and step_t.is_cuda and bs.is_cuda + + if 
torch.is_complex(param): + grad = torch.view_as_real(grad) + exp_avg = torch.view_as_real(exp_avg) + hess = torch.view_as_real(hess) + param = torch.view_as_real(param) + + # update step + step_t += 1 + + # Perform stepweight decay + param.mul_(1 - lr * weight_decay) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + + if capturable: + step_size = lr + step_size_neg = step_size.neg() + + ratio = (exp_avg.abs() / (rho * bs * hess + 1e-15)).clamp(None,1) + param.addcmul_(exp_avg.sign(), ratio, value=step_size_neg) + else: + step_size_neg = - lr + + ratio = (exp_avg.abs() / (rho * bs * hess + 1e-15)).clamp(None,1) + param.addcmul_(exp_avg.sign(), ratio, value=step_size_neg) diff --git a/megatron/profiler.py b/megatron/profiler.py new file mode 100644 index 0000000000..aeab144846 --- /dev/null +++ b/megatron/profiler.py @@ -0,0 +1,56 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. + +import torch + +on_step_begin = [] +on_step_end = [] + +def trigger(phase): + [f() for f in phase] + +def setup_profiler(args, device): + if args.profile is None: + return + + start_step, end_step = map(int, args.profile_steps.split(',')) + active_steps = end_step - start_step + 1 + cur_step = 0 + + def on_step_begin_fn(): + nonlocal cur_step + cur_step = cur_step + 1 + on_step_begin.append(on_step_begin_fn) + + def when(cond, clbk): + def fn(): + if cond(): + clbk() + return fn + + def is_start_step(): + return cur_step == start_step + + def is_end_step(): + return cur_step == end_step + + def is_capture_step(): + return cur_step >= start_step and cur_step <= end_step + + if args.profile.startswith('pt') and ( + args.profile_ranks is None or torch.distributed.get_rank() in args.profile_ranks + ): + schedule = torch.profiler.schedule(wait=0, warmup=0, active=active_steps, repeat=1) + activities = [torch.profiler.ProfilerActivity.CPU] + activities.extend([torch.profiler.ProfilerActivity.HPU] if 
device.startswith("hpu") else []) + activities.extend([torch.profiler.ProfilerActivity.CUDA] if device.startswith("cuda") else []) + full = args.profile == 'pt-full' + + profiler = torch.profiler.profile( + schedule=schedule, + activities=activities, + on_trace_ready=torch.profiler.tensorboard_trace_handler(args.tensorboard_dir, use_gzip=True), + with_stack=full) + + on_step_begin.append(when(is_start_step, profiler.start)) + on_step_end.append(when(is_capture_step, profiler.step)) + on_step_end.append(when(is_end_step, profiler.stop)) diff --git a/megatron/timers.py b/megatron/timers.py index 384c7c37a3..5a9ba1f21d 100644 --- a/megatron/timers.py +++ b/megatron/timers.py @@ -8,8 +8,14 @@ import torch from deepspeed.accelerator import get_accelerator +from tensorboard.summary import Writer from packaging import version +try: + import wandb +except Exception: + wandb = None + class TimerBase(ABC): @@ -33,11 +39,10 @@ def elapsed(self, reset=True, barrier=False): pass - class DummyTimer(TimerBase): def __init__(self): - super().__init__('dummy timer') + super().__init__("dummy timer") def start(self, barrier=False): return @@ -49,9 +54,7 @@ def reset(self): return def elapsed(self, reset=True, barrier=False): - raise Exception('dummy timer should not be used to ' - 'calculate elapsed time') - + raise Exception("dummy timer should not be used to " "calculate elapsed time") class Timer(TimerBase): @@ -72,37 +75,32 @@ def __init__(self, name): self._barrier_group = None self._start_time = time.time() - def set_barrier_group(self, barrier_group): self._barrier_group = barrier_group - def start(self, barrier=False): """Start the timer.""" - assert not self._started, 'timer has already been started' + assert not self._started, "timer has already been started" if barrier: torch.distributed.barrier(group=self._barrier_group) get_accelerator().synchronize() self._start_time = time.time() self._started = True - def stop(self, barrier=False): """Stop the timer.""" - assert 
self._started, 'timer is not started' + assert self._started, "timer is not started" if barrier: torch.distributed.barrier(group=self._barrier_group) get_accelerator().synchronize() - self._elapsed += (time.time() - self._start_time) + self._elapsed += time.time() - self._start_time self._started = False - def reset(self): """Reset timer.""" self._elapsed = 0.0 self._started = False - def elapsed(self, reset=True, barrier=False): """Calculate the elapsed time.""" _started = self._started @@ -120,7 +118,6 @@ def elapsed(self, reset=True, barrier=False): return _elapsed - class Timers: """Group of timers.""" @@ -132,24 +129,27 @@ def __init__(self, log_level, log_option): self._dummy_timer = DummyTimer() self._max_log_level = 2 - def __call__(self, name, log_level=None): # If the timer has already been set, then check if the log-level # is provided, it matches the one that the timer was created with. if name in self._timers: if log_level is not None: - assert log_level == self._log_levels[name], \ - 'input log level {} does not match already existing '\ - 'log level {} for {} timer'.format( - log_level, self._log_levels[name], name) + assert log_level == self._log_levels[name], ( + "input log level {} does not match already existing " + "log level {} for {} timer".format( + log_level, self._log_levels[name], name + ) + ) return self._timers[name] # If timer does not exist and no log level is provided, # set it to the max log level which is 2. if log_level is None: log_level = self._max_log_level - assert log_level <= self._max_log_level, \ - 'log level {} is larger than max supported log level {}'.format( - log_level, self._max_log_level) + assert ( + log_level <= self._max_log_level + ), "log level {} is larger than max supported log level {}".format( + log_level, self._max_log_level + ) # Now if the input log level is larger than the one set for # the timers class, just ignore it and return a dummy timer. 
if log_level > self._log_level: @@ -159,7 +159,6 @@ def __call__(self, name, log_level=None): self._log_levels[name] = log_level return self._timers[name] - def _get_elapsed_time_all_ranks(self, names, reset, barrier): """ Assumptions: @@ -185,34 +184,35 @@ def _get_elapsed_time_all_ranks(self, names, reset, barrier): # pytorch yet. It is simpler to deal with a single tensor # and since we are only gathering a small amount of data, # it should be ok to use all-gather instead of gather. - rank_name_to_time = torch.zeros((world_size, len(names)), - dtype=torch.float, - device=get_accelerator().current_device_name()) + rank_name_to_time = torch.zeros( + (world_size, len(names)), + dtype=torch.float, + device=get_accelerator().current_device_name(), + ) for i, name in enumerate(names): if name in self._timers: # Here we don't need to pass the barrier flag as all # the processes are already in sync. This avoids the # issue of different timers having different barrier # groups inside their class. - rank_name_to_time[rank, i] = self._timers[name].elapsed( - reset=reset) + rank_name_to_time[rank, i] = self._timers[name].elapsed(reset=reset) # See the note above for why we are not using gather. 
- if version.parse(torch.__version__) >= version.parse('1.13'): - torch.distributed.all_gather_into_tensor(rank_name_to_time.view(-1), - rank_name_to_time[rank, :].view(-1)) + if version.parse(torch.__version__) >= version.parse("1.13"): + torch.distributed.all_gather_into_tensor( + rank_name_to_time.view(-1), rank_name_to_time[rank, :].view(-1) + ) else: - torch.distributed._all_gather_base(rank_name_to_time.view(-1), - rank_name_to_time[rank, :].view(-1)) + torch.distributed._all_gather_base( + rank_name_to_time.view(-1), rank_name_to_time[rank, :].view(-1) + ) return rank_name_to_time - def _get_global_min_max_time(self, names, reset, barrier, normalizer): """Report only min and max times across all ranks.""" - rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, - barrier) + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, barrier) name_to_min_max_time = {} for i, name in enumerate(names): rank_to_time = rank_name_to_time[:, i] @@ -222,34 +222,36 @@ def _get_global_min_max_time(self, names, reset, barrier, normalizer): if rank_to_time.numel() > 0: name_to_min_max_time[name] = ( rank_to_time.min().item() / normalizer, - rank_to_time.max().item() / normalizer) + rank_to_time.max().item() / normalizer, + ) return name_to_min_max_time - - def _get_global_min_max_time_string(self, names, reset, barrier, - normalizer, max_only): + def _get_global_min_max_time_string( + self, names, reset, barrier, normalizer, max_only + ): name_to_min_max_time = self._get_global_min_max_time( - names, reset, barrier, normalizer) + names, reset, barrier, normalizer + ) if not name_to_min_max_time: return None - output_string = '(min, max) time across ranks (ms):' + output_string = "(min, max) time across ranks (ms):" for name in name_to_min_max_time: min_time, max_time = name_to_min_max_time[name] if max_only: - output_string += '\n {}: {:.2f}'.format( - (name+' ').ljust(48, '.'), max_time) + output_string += "\n {}: {:.2f}".format( + (name + " 
").ljust(48, "."), max_time + ) else: - output_string += '\n {}: ({:.2f}, {:.2f})'.format( - (name+' ').ljust(48, '.'), min_time, max_time) + output_string += "\n {}: ({:.2f}, {:.2f})".format( + (name + " ").ljust(48, "."), min_time, max_time + ) return output_string - def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): """Report times across all ranks.""" - rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, - barrier) + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, barrier) - output_string = 'times across ranks (ms):' + output_string = "times across ranks (ms):" no_reported_timing = True for i, name in enumerate(names): not_yet_found = True @@ -258,32 +260,32 @@ def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): no_reported_timing = False if not_yet_found: not_yet_found = False - output_string += '\n {}:'.format(name) - output_string += '\n rank {:2d}: {:.2f}'.format( - rank, rank_name_to_time[rank, i] / normalizer) + output_string += "\n {}:".format(name) + output_string += "\n rank {:2d}: {:.2f}".format( + rank, rank_name_to_time[rank, i] / normalizer + ) if no_reported_timing: return None return output_string - def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): """Log a group of timers.""" # Print. 
assert normalizer > 0.0 - if self._log_option in ['max', 'minmax']: + if self._log_option in ["max", "minmax"]: max_only = False - if self._log_option == 'max': + if self._log_option == "max": max_only = True output_string = self._get_global_min_max_time_string( - names, reset, barrier, normalizer/1000.0, max_only) - elif self._log_option == 'all': - output_string = self._get_all_ranks_time_string(names, - reset, barrier, - normalizer/1000.0) + names, reset, barrier, normalizer / 1000.0, max_only + ) + elif self._log_option == "all": + output_string = self._get_all_ranks_time_string( + names, reset, barrier, normalizer / 1000.0 + ) else: - raise Exception('unknown timing log option {}'.format( - self._log_option)) + raise Exception("unknown timing log option {}".format(self._log_option)) # If no input rank is provided, log on last rank. if rank is None: @@ -291,9 +293,15 @@ def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): if rank == torch.distributed.get_rank() and output_string is not None: print(output_string, flush=True) - - def write(self, names, writer, iteration, normalizer=1.0, - reset=False, barrier=False): + def write( + self, + names: list[str], + writer: Writer, + iteration: int, + normalizer: float = 1.0, + reset: bool = False, + barrier: bool = False, + ): """Write timers to a tensorboard writer Note that we only report maximum time across ranks to tensorboard. 
""" @@ -302,8 +310,22 @@ def write(self, names, writer, iteration, normalizer=1.0, # polutes the runs list, so we just add each as a scalar assert normalizer > 0.0 name_to_min_max_time = self._get_global_min_max_time( - names, reset, barrier, normalizer) - if writer is not None: + names, reset, barrier, normalizer + ) + # <<<<<<< HEAD + timer_data = { + "timers/iteration": iteration, + **{ + f"timers/{k}-time": name_to_min_max_time[k][1] + for k in name_to_min_max_time + }, + } + if wandb is not None and getattr(wandb, "run", None) is not None: + wandb.log(timer_data, commit=False) + # ======= + # if writer.is_enabled(): + # >>>>>>> 0d6e3793a1fc06eded9764ef15ad12bcc0281101 + if writer is not None: # and writer.is_enabled(): for name in name_to_min_max_time: _, max_time = name_to_min_max_time[name] - writer.add_scalar(name + '-time', max_time, iteration) + writer.add_scalar(f"{name}-time", max_time, iteration) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 023c2f756c..92853fb30d 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. """Megatron tokenizers.""" @@ -45,11 +46,14 @@ def build_tokenizer(args): tokenizer = _NullTokenizer(args.vocab_size) elif args.tokenizer_type == 'HFTokenizer': assert args.tokenizer_model is not None - tokenizer = _HFTokenizer(args.tokenizer_model) + # tokenizer = _HFTokenizer(args.tokenizer_model) + tokenizer = _HFTokenizer(args.tokenizer_model, + args.seq_length, + args.trust_remote_code) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) - + # Add vocab size. 
args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) @@ -595,10 +599,30 @@ def additional_special_tokens_ids(self): class _HFTokenizer(AbstractTokenizer): """HF Tokenizer""" - def __init__(self, tokenizer_name_or_path): + def __init__(self, tokenizer_name_or_path, max_seq_len, trust_remote_code): name = tokenizer_name_or_path super().__init__(name) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, + padding_side="right", + trust_remote_code=trust_remote_code, + use_fast=False) + DEFAULT_PAD_TOKEN = "[PAD]" + DEFAULT_EOS_TOKEN = "" + DEFAULT_BOS_TOKEN = "" + DEFAULT_UNK_TOKEN = "" + special_tokens_dict = dict() + if self.tokenizer.pad_token is None: + special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN + if self.tokenizer.eos_token is None: + special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN + if self.tokenizer.bos_token is None: + special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN + if self.tokenizer.unk_token is None: + special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN + self.tokenizer.add_special_tokens(special_tokens_dict) + # if self.tokenizer.pad_token == None: + # self.tokenizer.pad_token= "[PAD]" + self.tokenizer.model_max_length = max_seq_len self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} diff --git a/megatron/training.py b/megatron/training.py index 7e6c7dc6bb..d39f21c128 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1,98 +1,134 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - """Pretrain utilities.""" +import time + +# The earliest we can measure the start time. 
+_TRAIN_START_TIME = time.time() +from collections import OrderedDict from datetime import datetime +import json +import logging import math +import os import sys import time -import json -# The earliest we can measure the start time. -_TRAIN_START_TIME = time.time() + +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.compression.compress import init_compression, redundancy_clean +from deepspeed.runtime.data_pipeline.data_routing.helper import ( + convert_to_random_ltd, +) +import ezpz as ez import torch +import torch.distributed as tdist from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from megatron import get_args -from megatron import get_signal_handler -from megatron import get_timers -from megatron import get_tensorboard_writer -from megatron import get_current_global_batch_size -from megatron import get_num_microbatches -from megatron import is_last_rank -from megatron import update_num_microbatches +import wandb +from megatron import ( + get_args, + get_current_global_batch_size, + get_num_microbatches, + get_signal_handler, + get_tensorboard_writer, + get_timers, + is_last_rank, + update_num_microbatches, +) +from megatron.arguments import core_transformer_config_from_args +from megatron.checkpointing import load_checkpoint, save_checkpoint from megatron.core import mpu, tensor_parallel -from megatron import print_rank_0, is_rank_0 -from megatron import print_rank_last -from megatron.checkpointing import load_checkpoint -from megatron.checkpointing import save_checkpoint -from megatron.model import Float16Module -from megatron.model import GPTModel from megatron.core.enums import ModelType -from megatron.optimizer import get_megatron_optimizer -from megatron.initialize import initialize_megatron -from megatron.initialize import write_args_to_tensorboard -from megatron.initialize import set_jit_fusion_options -from megatron.optimizer_param_scheduler import OptimizerParamScheduler -from 
megatron.model import DistributedDataParallel as LocalDDP -from megatron.utils import check_adlr_autoresume_termination -from megatron.utils import unwrap_model -from megatron.data.data_samplers import build_pretraining_data_loader -from megatron.utils import calc_params_l2_norm from megatron.core.pipeline_parallel import get_forward_backward_func -from megatron.utils import report_memory, throughput_calculator, checkpoint_throughput_calculator, update_rotary_pos_emb +from megatron.data.data_samplers import build_pretraining_data_loader +from megatron.initialize import ( + initialize_megatron, + set_jit_fusion_options, + write_args_to_tensorboard, +) +from megatron.model import Float16Module, GPTModel +from megatron.model import DistributedDataParallel as LocalDDP +from megatron.model.transformer import ParallelTransformerLayer from megatron.model.vision.knn_monitor import compute_feature_bank -from megatron.arguments import core_transformer_config_from_args +from megatron.optimizer import get_megatron_optimizer +from megatron.optimizer_param_scheduler import OptimizerParamScheduler +from megatron.training_log import training_log +from megatron.utils import ( + PerfTrace, + Profile, + calc_params_l2_norm, + check_adlr_autoresume_termination, + checkpoint_throughput_calculator, + found_kill_switch, + unwrap_model, + update_rotary_pos_emb, +) -import deepspeed -from deepspeed.accelerator import get_accelerator -from deepspeed.compression.compress import init_compression, redundancy_clean -from deepspeed.runtime.data_pipeline.data_routing.helper import convert_to_random_ltd -from megatron.model.transformer import ParallelTransformerLayer +from megatron.profiler import ( + setup_profiler, + trigger, + on_step_begin, + on_step_end, +) + + +dlp = Profile("TRAINING") + +# from deepspeed import comm as dist -from deepspeed import comm as dist +RANK: int = ez.get_rank() +WORLD_SIZE: int = ez.get_world_size() +# DEVICE_TYPE: str = ez.get_torch_device() +DEVICE_TYPE: str = 
ez.dist.get_torch_device_type() +DEVICE: torch.device = torch.device(DEVICE_TYPE) -try: - import wandb -except (ImportError, ModuleNotFoundError): - wandb = None +log: logging.Logger = logging.getLogger(__name__) +LOG_LEVEL: str = str(os.environ.get("LOG_LEVEL", "INFO")).upper() +log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL") def print_datetime(string): """Note that this call will sync across all ranks.""" - torch.distributed.barrier() - time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - print_rank_0('[' + string + '] datetime: {} '.format(time_str)) + tdist.barrier() + time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + log.info("[" + string + "] datetime={} ".format(time_str)) -''' + +""" Since v0.9.0, deepspeed.initialize() has forbidden simultaneous setting of args.deepspeed_config (Path) and ds_config dict. -So, we use ds_config dict which is the more flexible option. -''' +So, we use ds_config dict which is the more flexible option +""" + + def _create_ds_config_dict(): args = get_args() - if isinstance(args.deepspeed_config, dict) : + assert args is not None + if isinstance(args.deepspeed_config, dict): ds_config_dict = args.deepspeed_config else: - with open(args.deepspeed_config, 'r', encoding='utf-8') as config_file: + with open(args.deepspeed_config, "r", encoding="utf-8") as config_file: ds_config_dict = json.load(config_file) - if args.universal_checkpoint: ds_config_dict["checkpoint"] = {"load_universal": True} - # Clear config path - args.deepspeed_config = None - + args.deepspeed_config = None return ds_config_dict - - -def pretrain(train_valid_test_dataset_provider, - model_provider, - model_type, - forward_step_func, - process_non_loss_data_func=None, - extra_args_provider=None, - args_defaults={}, - data_post_process=None, - external_args={}): + + +@dlp.log +def pretrain( + train_valid_test_dataset_provider, + model_provider, + model_type, + forward_step_func, + process_non_loss_data_func=None, + 
extra_args_provider=None, + args_defaults={}, + data_post_process=None, + external_args={}, +) -> list[torch.nn.Module]: """Main training program. This function will run the followings in the order provided: @@ -120,72 +156,112 @@ def pretrain(train_valid_test_dataset_provider, to it. It is used for programs to add their own arguments. args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. - """ + Returns: + model (torch.nn.Module) + """ # Initalize and get arguments, timers, and Tensorboard writer. - initialize_megatron(extra_args_provider=extra_args_provider, - args_defaults=args_defaults, external_args=external_args) + initialize_megatron( + extra_args_provider=extra_args_provider, + args_defaults=args_defaults, + external_args=external_args, + ) + args = get_args() + assert args is not None + if found_kill_switch(): + print_datetime(f"Detected kill switch at {args.kill_switch_file}. Exiting") + sys.exit() + # Set pytorch JIT layer fusion options and warmup JIT functions. - if get_accelerator().device_name() == 'cuda': + # if get_accelerator().device_name() == "cuda": + if DEVICE_TYPE == "cuda" and torch.cuda.is_available(): set_jit_fusion_options() # Adjust the startup time so it reflects the largest value. # This will be closer to what scheduler will see (outside of # image ... launches. 
+ before_allreduce = time.time() global _TRAIN_START_TIME - start_time_tensor = get_accelerator().DoubleTensor([_TRAIN_START_TIME]) - torch.distributed.all_reduce(start_time_tensor, - op=torch.distributed.ReduceOp.MIN) + log.info( + f"time to finish initialize_megatron: {time.time() - _TRAIN_START_TIME} seconds" + ) + # start_time_tensor = DEVICE.DoubleTensor([_TRAIN_START_TIME]) + start_time_tensor = torch.tensor( + [_TRAIN_START_TIME], dtype=torch.double, device=DEVICE_TYPE + ) + tdist.all_reduce(start_time_tensor, op=tdist.ReduceOp.MIN) + log.info(f"allreduce call time: {time.time()-before_allreduce} seconds") _TRAIN_START_TIME = start_time_tensor.item() - print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( - time.time() - _TRAIN_START_TIME)) - print_datetime('after megatron is initialized') - - args = get_args() + log.info( + "time to initialize megatron (seconds)={:.3f}".format( + time.time() - _TRAIN_START_TIME + ) + ) + print_datetime("after megatron is initialized") + if os.getenv("DLIO_PROFILER_DATASET_DIR") is not None: + extra_trace_path = os.environ["DLIO_PROFILER_DATASET_DIR"] + else: + extra_trace_path = "" + os.makedirs(args.trace_dir, exist_ok=True) + PerfTrace.initialize_log( + f"{args.trace_dir}/trace-{ez.get_rank()}-of-{ez.get_world_size()}.pfw", + f"{args.data_cache_path}:{extra_trace_path}:{args.data_path}:{args.save}:{args.load}", + process_id=ez.get_rank(), + ) timers = get_timers() - + assert args is not None + assert timers is not None if args.deepspeed: args.deepspeed_config_dict = _create_ds_config_dict() - if "curriculum_learning" in args.deepspeed_config_dict and \ - "enabled" in args.deepspeed_config_dict["curriculum_learning"]: - args.curriculum_learning_legacy = args.deepspeed_config_dict[ \ - "curriculum_learning"]["enabled"] + if ( + "curriculum_learning" in args.deepspeed_config_dict + and "enabled" in args.deepspeed_config_dict["curriculum_learning"] + ): + args.curriculum_learning_legacy = 
args.deepspeed_config_dict[ + "curriculum_learning" + ]["enabled"] if args.curriculum_learning_legacy and not args.no_pipeline_parallel: - from deepspeed.runtime.data_pipeline.curriculum_scheduler \ - import CurriculumScheduler - args.curriculum_scheduler = CurriculumScheduler( \ - args.deepspeed_config_dict["curriculum_learning"]) + from deepspeed.runtime.data_pipeline.curriculum_scheduler import ( + CurriculumScheduler, + ) + + args.curriculum_scheduler = CurriculumScheduler( + args.deepspeed_config_dict["curriculum_learning"] + ) if "compression_training" in args.deepspeed_config_dict: args.compression_training = True # Model, optimizer, and learning rate. - timers('model-and-optimizer-setup', log_level=0).start(barrier=True) + timers("model-and-optimizer-setup", log_level=0).start(barrier=True) model, optimizer, opt_param_scheduler = setup_model_and_optimizer( - model_provider, model_type, teacher=False, data_post_process=data_post_process, - build_train_valid_test_datasets_provider=train_valid_test_dataset_provider) - timers('model-and-optimizer-setup').stop() - print_datetime('after model, optimizer, and learning rate ' - 'scheduler are built') - + model_provider, + model_type, + teacher=False, + data_post_process=data_post_process, + build_train_valid_test_datasets_provider=train_valid_test_dataset_provider, + ) + timers("model-and-optimizer-setup").stop() + print_datetime("after model, optimizer, and learning rate " "scheduler are built") # Data stuff. 
- timers('train/valid/test-data-iterators-setup', log_level=0).start( - barrier=True) + timers("train/valid/test-data-iterators-setup", log_level=0).start(barrier=True) if args.virtual_pipeline_model_parallel_size is not None: all_data_iterators = [ - build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) + build_train_valid_test_data_iterators(train_valid_test_dataset_provider) for _ in range(len(model)) ] - train_data_iterator = [data_iterators[0] - for data_iterators in all_data_iterators] - valid_data_iterator = [data_iterators[1] - for data_iterators in all_data_iterators] - test_data_iterator = [data_iterators[2] - for data_iterators in all_data_iterators] + train_data_iterator = [ + data_iterators[0] for data_iterators in all_data_iterators + ] + valid_data_iterator = [ + data_iterators[1] for data_iterators in all_data_iterators + ] + test_data_iterator = [ + data_iterators[2] for data_iterators in all_data_iterators + ] else: - train_data_iterator, valid_data_iterator, test_data_iterator \ - = build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) + train_data_iterator, valid_data_iterator, test_data_iterator = ( + build_train_valid_test_data_iterators(train_valid_test_dataset_provider) + ) if args.data_efficiency_curriculum_learning: if args.deepspeed_dataloader is not None: # We use args to pass the deepspeed_dataloader because adding @@ -198,67 +274,79 @@ def pretrain(train_valid_test_dataset_provider, args.deepspeed_dataloader = None else: train_data_iterator = None - timers('train/valid/test-data-iterators-setup').stop() - print_datetime('after dataloaders are built') - + timers("train/valid/test-data-iterators-setup").stop() + print_datetime("after dataloaders are built") # args.teacher_model is used as global variable to pass the teacher model # for knowledge distillation. 
Users do not need to set it in the command # line to use kd, but users do need to provide teacher model configurations # like args.num_layers_teacher as described in setup_teacher_model() args.teacher_model = None - if args.mos or args.kd: # Set up teacher model + if args.mos or args.kd: # Set up teacher model args.teacher_model = setup_teacher_model(args, model_provider) - # Print setup timing. - print_rank_0('done with setup ...') - timers.log(['model-and-optimizer-setup', - 'train/valid/test-data-iterators-setup'], barrier=True) - + log.info("done with setup ...") + timers.log( + ["model-and-optimizer-setup", "train/valid/test-data-iterators-setup"], + barrier=True, + ) if not args.skip_train: - print_rank_0('training ...') - - if args.dataloader_type == 'cyclic' and args.retro_add_retriever: + log.info("training ...") + if args.dataloader_type == "cyclic" and args.retro_add_retriever: args.train_iters = args.retro_cyclic_train_iters - print_rank_0("retro cyclic train iters : %d" % args.train_iters) - + log.info("retro cyclic train iters : %d" % args.train_iters) iteration = 0 if args.do_train and args.train_iters > 0: - iteration = train(forward_step_func, - model, optimizer, opt_param_scheduler, - train_data_iterator, valid_data_iterator, - process_non_loss_data_func) - - print_datetime('after training is done') + iteration = train( + forward_step_func, + model, + optimizer, + opt_param_scheduler, + train_data_iterator, + valid_data_iterator, + process_non_loss_data_func, + ) + print_datetime("after training is done") # Clean the model if args.compression_training: model = [redundancy_clean(model[0], args.deepspeed_config_dict, mpu)] - if args.save and iteration != 0: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) else: - print_rank_0('skipping training (--skip-train is on) ...') - + log.info("skipping training (--skip-train is on) ...") iteration = args.iteration - config = core_transformer_config_from_args(args) if args.do_valid: - prefix 
= f'iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from validation set' - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train) - + prefix = f"iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from validation set" + _ = evaluate_and_print_results( + prefix, + forward_step_func, + valid_data_iterator, + model, + iteration, + process_non_loss_data_func, + config, + verbose=True, + write_to_tensorboard=not args.skip_train, + ) if args.do_test: - prefix = f'iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from test set' - evaluate_and_print_results(prefix, forward_step_func, - test_data_iterator, model, - iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train, test=True) + prefix = f"iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from test set" + _ = evaluate_and_print_results( + prefix, + forward_step_func, + test_data_iterator, + model, + iteration, + process_non_loss_data_func, + config, + verbose=True, + write_to_tensorboard=not args.skip_train, + test=True, + ) return model +@dlp.log def update_train_iters(args): - # For iteration-based training, we don't need to do anything if args.train_iters: return @@ -280,16 +368,15 @@ def update_train_iters(args): update_num_microbatches(0, consistency_check=False) # Constant phase # Note that we throw away any partial last batch. 
- iterations += (args.train_samples - consumed_samples) // \ - args.global_batch_size + iterations += (args.train_samples - consumed_samples) // args.global_batch_size args.train_iters = iterations - print_rank_0('setting training iterations to {}'.format(args.train_iters)) + log.info("setting training iterations to {}".format(args.train_iters)) -def setup_teacher_model(args, model_provider): - - print_rank_0('***>>>>> Student model checkpoint iteration:{}'.format(args.iteration)) +@dlp.log +def setup_teacher_model(args, model_provider): + log.info("***>>>>> Student model checkpoint iteration:{}".format(args.iteration)) iteration_stuent = args.iteration num_layers_student = args.num_layers num_experts_student = args.num_experts @@ -297,7 +384,7 @@ def setup_teacher_model(args, model_provider): num_attention_heads_student = args.num_attention_heads load_student = args.load - print_rank_0('***>>>>> Setting up the teacher model') + log.info("***>>>>> Setting up the teacher model") args.num_layers = args.num_layers_teacher args.num_experts = args.num_experts_teacher @@ -305,7 +392,7 @@ def setup_teacher_model(args, model_provider): args.num_attention_heads = args.num_attention_heads_teacher args.load = args.load_teacher teacher_model, _, _ = load_model_weights_only(model_provider) - print_rank_0('***>>>>> Teacher model:{}'.format(teacher_model)) + log.info("***>>>>> Teacher model:{}".format(teacher_model)) args.num_layers = num_layers_student args.num_experts = num_experts_student @@ -316,16 +403,27 @@ def setup_teacher_model(args, model_provider): return teacher_model -def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): + +@dlp.log +@ez.dist.timeitlogit(rank=RANK) +def get_model( + model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True +): """Build the model.""" args = get_args() + accelerator = get_accelerator() + assert accelerator is not None + assert args is not None args.model_type = 
model_type # Build model. - if mpu.get_pipeline_model_parallel_world_size() > 1 and \ - args.virtual_pipeline_model_parallel_size is not None: - assert model_type != ModelType.encoder_and_decoder, \ - "Interleaved schedule not supported for model with both encoder and decoder" + if ( + mpu.get_pipeline_model_parallel_world_size() > 1 + and args.virtual_pipeline_model_parallel_size is not None + ): + assert ( + model_type != ModelType.encoder_and_decoder + ), "Interleaved schedule not supported for model with both encoder and decoder" model = [] for i in range(args.virtual_pipeline_model_parallel_size): mpu.set_virtual_pipeline_model_parallel_rank(i) @@ -333,8 +431,7 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap pre_process = mpu.is_pipeline_first_stage() post_process = mpu.is_pipeline_last_stage() this_model = model_provider_func( - pre_process=pre_process, - post_process=post_process + pre_process=pre_process, post_process=post_process ) this_model.model_type = model_type model.append(this_model) @@ -345,37 +442,37 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap add_decoder = True if model_type == ModelType.encoder_and_decoder: if mpu.get_pipeline_model_parallel_world_size() > 1: - assert args.pipeline_model_parallel_split_rank is not None, \ - "Split rank needs to be specified for model with both encoder and decoder" + assert ( + args.pipeline_model_parallel_split_rank is not None + ), "Split rank needs to be specified for model with both encoder and decoder" rank = mpu.get_pipeline_model_parallel_rank() split_rank = args.pipeline_model_parallel_split_rank world_size = mpu.get_pipeline_model_parallel_world_size() pre_process = rank == 0 or rank == split_rank - post_process = (rank == (split_rank - 1)) or ( - rank == (world_size - 1)) + post_process = (rank == (split_rank - 1)) or (rank == (world_size - 1)) add_encoder = mpu.is_pipeline_stage_before_split() add_decoder = 
mpu.is_pipeline_stage_after_split() model = model_provider_func( pre_process=pre_process, post_process=post_process, add_encoder=add_encoder, - add_decoder=add_decoder) + add_decoder=add_decoder, + ) else: model = model_provider_func( - pre_process=pre_process, - post_process=post_process + pre_process=pre_process, post_process=post_process ) model.model_type = model_type - if not isinstance(model, list): model = [model] # Disallow training and inference with Transformer Engine # for non-GPT models args.allow_transformer_engine = all([type(m) == GPTModel for m in model]) - assert args.allow_transformer_engine or args.transformer_impl == 'local', \ - 'Transformer Engine is only approved for GPT models' + assert ( + args.allow_transformer_engine or args.transformer_impl == "local" + ), "Transformer Engine is only approved for GPT models" # Set tensor model parallel attributes if not set. # Only parameters that are already tensor model parallel have these @@ -383,56 +480,84 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap # are set for all params so the optimizer can use them. for model_module in model: for param in model_module.parameters(): - tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes(param) + tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes( + param + ) # Print number of parameters. 
if mpu.get_data_parallel_rank() == 0: - print(' > number of parameters on (tensor, pipeline) ' - 'model parallel rank ({}, {}): {}'.format( - mpu.get_tensor_model_parallel_rank(), - mpu.get_pipeline_model_parallel_rank(), - sum([sum([p.ds_numel if hasattr(p,'ds_id') else p.nelement() for p in model_module.parameters()]) - for model_module in model])), flush=True) + print( + " > number of parameters on (tensor, pipeline) " + "model parallel rank ({}, {})={}".format( + mpu.get_tensor_model_parallel_rank(), + mpu.get_pipeline_model_parallel_rank(), + sum( + [ + sum( + [ + p.ds_numel if hasattr(p, "ds_id") else p.nelement() + for p in model_module.parameters() + ] + ) + for model_module in model + ] + ), + ), + flush=True, + ) if args.deepspeed: return model # GPU allocation. for model_module in model: - model_module.to(get_accelerator().current_device_name()) - + model_module.to(DEVICE_TYPE) # Fp16 conversion. if args.fp16 or args.bf16: model = [Float16Module(model_module, args) for model_module in model] if wrap_with_ddp: - if args.DDP_impl == 'torch': - i = get_accelerator().current_device() - model = [torchDDP(model_module, device_ids=[i], output_device=i, - process_group=mpu.get_data_parallel_group()) - for model_module in model] - - elif args.DDP_impl == 'local': - model = [LocalDDP(model_module, - args.accumulate_allreduce_grads_in_fp32, - args.use_contiguous_buffers_in_local_ddp) - for model_module in model] + if args.DDP_impl == "torch": + i = accelerator.current_device() + model = [ + torchDDP( + model_module, + device_ids=[i], + output_device=i, + process_group=mpu.get_data_parallel_group(), + ) + for model_module in model + ] + + elif args.DDP_impl == "local": + model = [ + LocalDDP( + model_module, + args.accumulate_allreduce_grads_in_fp32, + args.use_contiguous_buffers_in_local_ddp, + ) + for model_module in model + ] # broad cast params from data parallel src rank to other data parallel ranks if args.data_parallel_random_init: for model_module in model: 
model_module.broadcast_params() else: - raise NotImplementedError('Unknown DDP implementation specified: ' - '{}. Exiting.'.format(args.DDP_impl)) + raise NotImplementedError( + "Unknown DDP implementation specified: " + "{}. Exiting.".format(args.DDP_impl) + ) return model +@dlp.log +@ez.dist.timeitlogit(rank=RANK) def get_optimizer_param_scheduler(optimizer): """Build the learning rate scheduler.""" args = get_args() - + assert args is not None # Iteration-based training. if args.train_iters: if args.lr_decay_iters is None: @@ -458,8 +583,7 @@ def get_optimizer_param_scheduler(optimizer): else: lr_warmup_steps = args.lr_warmup_samples else: - raise Exception( - 'either train-iters or train-samples should be provided.') + raise Exception("either train-iters or train-samples should be provided.") opt_param_scheduler = OptimizerParamScheduler( optimizer, @@ -473,74 +597,75 @@ def get_optimizer_param_scheduler(optimizer): wd_incr_steps=wd_incr_steps, wd_incr_style=args.weight_decay_incr_style, use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler, - override_opt_param_scheduler=args.override_opt_param_scheduler) + override_opt_param_scheduler=args.override_opt_param_scheduler, + ) return opt_param_scheduler + +@dlp.log def load_model_weights_only(model_provider_func): """Setup model and optimizer.""" args = get_args() - print_rank_0('***>>>>> Args:{}'.format(args)) - + assert args is not None + log.info("***>>>>> Args:{}".format(args)) model = get_model(model_provider_func) - optimizer = None lr_scheduler = None - if args.deepspeed: # When loading just the model weights, ZeRO can be disabled. 
- if 'zero_optimization' in args.deepspeed_config_dict: - del args.deepspeed_config_dict['zero_optimization'] + if "zero_optimization" in args.deepspeed_config_dict: + del args.deepspeed_config_dict["zero_optimization"] model, optimizer, _, lr_scheduler = deepspeed.initialize( - model=model[0], - config=args.deepspeed_config_dict + model=model[0], config=args.deepspeed_config_dict ) - assert not isinstance(model, deepspeed.PipelineEngine), \ - 'Weight loading only mode is not supported in pipeline parallelism yet.' - + assert not isinstance(model, deepspeed.PipelineEngine), ( + "Weight loading only mode is not supported in " "pipeline parallelism yet." + ) model = [model] - - print_datetime('before load checkpoint') + print_datetime("before load checkpoint") if args.load is not None: - iteration = load_checkpoint(model, optimizer, lr_scheduler, strict=True, load_only_weights=True) - - print_datetime('after load checkpoint weights') - + _ = load_checkpoint( + model, optimizer, lr_scheduler, strict=True, load_only_weights=True + ) + print_datetime("after load checkpoint weights") return model, optimizer, lr_scheduler -def setup_model_and_optimizer(model_provider_func, - model_type, - no_wd_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0, - teacher=False, - data_post_process=None, - build_train_valid_test_datasets_provider=None): +@dlp.log +@ez.dist.timeitlogit(rank=RANK) +def setup_model_and_optimizer( + model_provider_func, + model_type, + no_wd_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0, + teacher=False, + data_post_process=None, + build_train_valid_test_datasets_provider=None, +): """Setup model and optimizer.""" args = get_args() - + assert args is not None model = get_model(model_provider_func, model_type) - # initialize the compression here student_global_steps = 0 if args.kd or args.mos: model, _, _, _ = deepspeed.initialize( - model=model[0], - args=args, - mpu=mpu if args.no_pipeline_parallel else None, - config=args.deepspeed_config_dict, - ) 
+ model=model[0], + args=args, + mpu=mpu if args.no_pipeline_parallel else None, + config=args.deepspeed_config_dict, + ) model = [model] if args.load is not None: args.iteration = load_checkpoint(model, None, None, strict=False) else: args.iteration = 0 student_global_steps = model[0].global_steps - print_rank_0('***>>>>> Student model, global step:{}'.format(student_global_steps)) - + log.info("***>>>>> Student model, global step:{}".format(student_global_steps)) if args.compression_training: model, _, _, _ = deepspeed.initialize( model=model[0], @@ -550,10 +675,7 @@ def setup_model_and_optimizer(model_provider_func, ) model = [model] model = [init_compression(model[0].module, args.deepspeed_config_dict, mpu)] - - unwrapped_model = unwrap_model(model, - (torchDDP, LocalDDP, Float16Module)) - + unwrapped_model = unwrap_model(model, (torchDDP, LocalDDP, Float16Module)) if args.inference: optimizer = None opt_param_scheduler = None @@ -561,125 +683,173 @@ def setup_model_and_optimizer(model_provider_func, if teacher: optimizer = None else: - optimizer = get_megatron_optimizer(model, no_wd_decay_cond, - scale_lr_cond, lr_mult) + optimizer = get_megatron_optimizer( + model, no_wd_decay_cond, scale_lr_cond, lr_mult + ) # opt_param_scheduler is the old lr_scheduler plus weight decay scheduling opt_param_scheduler = get_optimizer_param_scheduler(optimizer) - if args.deepspeed: - print_rank_0("DeepSpeed is enabled.") - pp = mpu.get_pipeline_model_parallel_world_size() - if args.data_efficiency_curriculum_learning and build_train_valid_test_datasets_provider is not None: + log.info("DeepSpeed is enabled.") + # pp = mpu.get_pipeline_model_parallel_world_size() + if ( + args.data_efficiency_curriculum_learning + and build_train_valid_test_datasets_provider is not None + ): + log.info( + "Caught 'args.data_efficiency_curriculum_learning' " + "and 'build_train_valid_test_datasets_provider is not None'" + ) train_ds = None # Only need to build dataset on tp rank 0 since 
Megatron has the # broadcast_data() function that broadcast data from tp rank 0. if mpu.get_tensor_model_parallel_rank() == 0: + log.info("Caught 'mpu.get_tensor_model_parallel_rank() == 0'") # Number of train/valid/test samples. if args.train_samples: train_samples = args.train_samples update_train_iters(args) else: train_samples = args.train_iters * args.global_batch_size + log.info(f"{train_samples=}") # eval_iters and test_iters here are not actually used, only for # satisfying the input of build_train_valid_test_datasets_provider. # We only need to build the training data here. And we follow # baseline's logic to build eval/test dataset later in # build_train_valid_test_data_iterators. - eval_iters = (args.train_iters // args.eval_interval + 1) * \ - args.eval_iters + eval_iters = ( + args.train_iters // args.eval_interval + 1 + ) * args.eval_iters test_iters = args.eval_iters - train_val_test_num_samples = [train_samples, - eval_iters * args.global_batch_size, - test_iters * args.global_batch_size] + train_val_test_num_samples = [ + train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size, + ] + log.info(f"{train_val_test_num_samples=}") # Build the datasets. 
train_ds, _, _ = build_train_valid_test_datasets_provider( - train_val_test_num_samples) - model, optimizer, args.deepspeed_dataloader, opt_param_scheduler = deepspeed.initialize( - model=model[0], - optimizer=optimizer, - args=args, - lr_scheduler=opt_param_scheduler, - training_data=train_ds, - mpu=mpu if args.no_pipeline_parallel else None, - config=args.deepspeed_config_dict, - ) + train_val_test_num_samples + ) + with Profile("deepspeed.initialize"): + model, optimizer, args.deepspeed_dataloader, opt_param_scheduler = ( + deepspeed.initialize( + model=model[0], + optimizer=optimizer, + args=args, + lr_scheduler=opt_param_scheduler, + training_data=train_ds, + mpu=mpu if args.no_pipeline_parallel else None, + config=args.deepspeed_config_dict, + ) + ) model.set_data_post_process_func(data_post_process) else: - model, optimizer, _, opt_param_scheduler = deepspeed.initialize( - model=model[0], - optimizer=optimizer, - args=args, - lr_scheduler=opt_param_scheduler, - mpu=mpu if args.no_pipeline_parallel else None, - config=args.deepspeed_config_dict, + log.info( + "Did NOT catch: ('args.data_efficiency_curriculum_learning' " + "and 'build_train_valid_test_datasets_provider is not None')" ) + tds0 = time.time() + if os.environ.get("PYINSTRUMENT_PROFILER", None): + profiler = ez.profile.get_context_manager(rank=RANK, outdir=args.save) + else: + profiler = Profile("deepspeed.initialize") + log.info("Calling 'deepspeed.initialize'...") + log.info(f"Wrapped with: {profiler=}") + with profiler: + model, optimizer, _, opt_param_scheduler = deepspeed.initialize( + model=model[0], + optimizer=optimizer, + args=args, + lr_scheduler=opt_param_scheduler, + mpu=mpu if args.no_pipeline_parallel else None, + config=args.deepspeed_config_dict, + ) + log.info(f"'deepspeed.initialize' took: {time.time() - tds0:.5f}s") if isinstance(model, deepspeed.PipelineEngine): # hack to get batch_fn from pretrain_gpt.py model.set_batch_fn(model.module._megatron_batch_fn) - - assert 
model.grid.get_pipe_parallel_rank() == mpu.get_pipeline_model_parallel_rank() - assert model.grid.get_slice_parallel_rank() == mpu.get_tensor_model_parallel_rank() + assert ( + model.grid.get_pipe_parallel_rank() + == mpu.get_pipeline_model_parallel_rank() + ) + assert ( + model.grid.get_slice_parallel_rank() + == mpu.get_tensor_model_parallel_rank() + ) assert model.grid.get_data_parallel_rank() == mpu.get_data_parallel_rank() model = [model] - - # Compression has its own checkpoint loading path (e.g, loading both teacher and student models). So if compression is enabled, we skip the following checkpoint loading. + # Compression has its own checkpoint loading path (e.g, loading both teacher + # and student models). So if compression is enabled, we skip the following + # checkpoint loading. no_post_init_checkpoint_loading = args.kd or args.mos if not no_post_init_checkpoint_loading: if args.load is not None: timers = get_timers() - timers('load-checkpoint', log_level=0).start(barrier=True) + assert timers is not None + timers("load-checkpoint", log_level=0).start(barrier=True) args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler) - timers('load-checkpoint').stop(barrier=True) - timers.log(['load-checkpoint']) + timers("load-checkpoint").stop(barrier=True) + timers.log(["load-checkpoint"]) else: args.iteration = 0 else: model[0].global_steps = student_global_steps - # We only support local DDP with multiple micro-batches. 
if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1: - assert args.DDP_impl == 'local' - + assert args.DDP_impl == "local" # get model without FP16 and/or TorchDDP wrappers - if args.iteration == 0 and len(unwrapped_model) == 1 \ - and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'): - print_rank_0("Initializing ICT from pretrained BERT model") + if ( + args.iteration == 0 + and len(unwrapped_model) == 1 + and hasattr(unwrapped_model[0], "init_state_dict_from_bert") + ): + log.info("Initializing ICT from pretrained BERT model") unwrapped_model[0].init_state_dict_from_bert() if args.fp16: + assert optimizer is not None optimizer.reload_model_params() - # random-LTD requires converting transformer layers if args.random_ltd: model[0] = convert_to_random_ltd(model[0], ParallelTransformerLayer) - return model, optimizer, opt_param_scheduler - -def train_step(forward_step_func, data_iterator, - model, optimizer, opt_param_scheduler, config): +@dlp.log +def train_step( + forward_step_func, data_iterator, model, optimizer, opt_param_scheduler, config +): """Single training step.""" args = get_args() timers = get_timers() - + accelerator = get_accelerator() + assert args is not None and timers is not None and accelerator is not None + grad_norm = None + num_zeros_in_grad = None if args.deepspeed and args.ds_pipeline_enabled: - skipped_iter = 0 num_zeros_in_grad = 0 assert isinstance(model[0], deepspeed.PipelineEngine) loss = model[0].train_batch(data_iter=data_iterator) + additional_losses = model[0].get_additional_losses() + loss_key = ( + "lm loss" if additional_losses is None else "loss" + ) # use "lm loss" for backward compatibility + loss_dict = OrderedDict({loss_key: loss}) + if additional_losses is not None: + loss_dict.update(additional_losses) grad_norm = model[0].get_global_grad_norm() - return {'lm loss' : loss}, skipped_iter, grad_norm, num_zeros_in_grad + update_successful = model[0].was_step_applied() + skipped_iter = 0 if 
update_successful else 1 + return loss_dict, skipped_iter, grad_norm, num_zeros_in_grad # Set grad to zero. if not args.deepspeed: - if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_local_ddp: + if args.DDP_impl == "local" and args.use_contiguous_buffers_in_local_ddp: for partition in model: partition.zero_grad_buffer() optimizer.zero_grad() # Forward pass. - timers('forward-backward', log_level=1).start( - barrier=args.barrier_with_L1_time) + timers("forward-backward", log_level=1).start(barrier=args.barrier_with_L1_time) forward_backward_func = get_forward_backward_func() if args.mos or args.kd: # args.teacher_forward is used as global variable to enable kd loss @@ -691,26 +861,29 @@ def train_step(forward_step_func, data_iterator, if args.timing_log_level < 2: config.timers = None + num_microbatches = get_num_microbatches() + assert num_microbatches is not None losses_reduced = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, model=model, - num_microbatches=get_num_microbatches(), + num_microbatches=num_microbatches, seq_length=args.seq_length, micro_batch_size=args.micro_batch_size, decoder_seq_length=args.decoder_seq_length, - forward_only=False) + forward_only=False, + ) # reset timers if necessary if config.timers is None: config.timers = timers - timers('forward-backward').stop() + timers("forward-backward").stop() if args.mos or args.kd: args.teacher_forward = False # Empty unused memory. - if args.empty_unused_memory_level >= 1: - torch.cuda.empty_cache() + if args.empty_unused_memory_level >= 1 and accelerator is not None: + accelerator.empty_cache() # Reduce gradients. if not args.deepspeed: @@ -718,21 +891,23 @@ def train_step(forward_step_func, data_iterator, # Vision gradients. 
if args.vision_pretraining and args.vision_pretraining_type == "dino": - unwrapped_model = unwrap_model(model[0], - (torchDDP, LocalDDP, Float16Module)) + unwrapped_model = unwrap_model(model[0], (torchDDP, LocalDDP, Float16Module)) unwrapped_model.cancel_gradients_last_layer(args.curr_iteration) # Update parameters. - timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time) + timers("optimizer", log_level=1).start(barrier=args.barrier_with_L1_time) if args.deepspeed: - increment = get_num_microbatches() * \ - args.micro_batch_size * \ - args.data_parallel_size - model[0].step(lr_kwargs={'increment': increment}) - update_successful = model[0].was_step_applied() + increment = ( + get_num_microbatches() * args.micro_batch_size * args.data_parallel_size + ) + try: + model[0].step(lr_kwargs={"increment": increment}) + update_successful = model[0].was_step_applied() + except Exception: + update_successful = False else: update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers) - timers('optimizer').stop() + timers("optimizer").stop() # Gather params. if not args.deepspeed and update_successful: @@ -740,511 +915,258 @@ def train_step(forward_step_func, data_iterator, # Vision momentum. if args.vision_pretraining and args.vision_pretraining_type == "dino": - unwrapped_model = unwrap_model(model[0], - (torchDDP, LocalDDP, Float16Module)) + unwrapped_model = unwrap_model(model[0], (torchDDP, LocalDDP, Float16Module)) unwrapped_model.update_momentum(args.curr_iteration) # Update learning rate. if args.deepspeed: - skipped_iter = 0 - grad_norm = None + skipped_iter = 0 if update_successful else 1 + grad_norm = model[0].get_global_grad_norm() + # Empty unused memory. + if args.empty_unused_memory_level >= 2 and accelerator is not None: + accelerator.empty_cache() + # XXX: [saforem2]: ---------------------------------------------------- + # Is `num_zeros_in_grad` worth calculating (/ implementing) ?? 
+ # the `Megatron`-specific implementation is at: + # [megatron.optimizer.clip_grads.count_zeros_fp32](./optimizer/clip_grads.py) + # For now, explicitly set to None + # --------------------------------------------------------------------- num_zeros_in_grad = None - loss_reduced = {} for key in losses_reduced[0]: losses_reduced_for_key = [x[key] for x in losses_reduced] - loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key) + loss_reduced[key] = sum(losses_reduced_for_key) / len( + losses_reduced_for_key + ) return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad + if update_successful: + increment = ( + get_num_microbatches() * args.micro_batch_size * args.data_parallel_size + ) + opt_param_scheduler.step(increment=increment) + skipped_iter = 0 else: - if update_successful: - increment = get_num_microbatches() * \ - args.micro_batch_size * \ - args.data_parallel_size - opt_param_scheduler.step(increment=increment) - skipped_iter = 0 - else: - skipped_iter = 1 - - # Empty unused memory. - if args.empty_unused_memory_level >= 2: - torch.cuda.empty_cache() - - if mpu.is_pipeline_last_stage(ignore_virtual=True): - # Average loss across microbatches. - loss_reduced = {} - for key in losses_reduced[0]: - losses_reduced_for_key = [x[key] for x in losses_reduced] - loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key) - return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad - return {}, skipped_iter, grad_norm, num_zeros_in_grad + skipped_iter = 1 + # Empty unused memory. 
+ if args.empty_unused_memory_level >= 2 and accelerator is not None: + accelerator.empty_cache() -def training_log(loss_dict, total_loss_dict, learning_rate, iteration, - loss_scale, report_memory_flag, skipped_iter, - grad_norm, params_norm, num_zeros_in_grad, - model=None, optimizer=None): - """Log training information such as losses, timing, ....""" - args = get_args() - timers = get_timers() - writer = get_tensorboard_writer() - - # Advanced, skipped, and Nan iterations. - advanced_iters_key = 'advanced iterations' - skipped_iters_key = 'skipped iterations' - nan_iters_key = 'nan iterations' - # Advanced iterations. - if not skipped_iter: - total_loss_dict[advanced_iters_key] = total_loss_dict.get( - advanced_iters_key, 0) + 1 - else: - if advanced_iters_key not in total_loss_dict: - total_loss_dict[advanced_iters_key] = 0 - # Skipped iterations. - total_loss_dict[skipped_iters_key] = total_loss_dict.get( - skipped_iters_key, 0) + skipped_iter - # Update losses and set nan iterations - got_nan = False - for key in loss_dict: - if not skipped_iter: - total_loss_dict[key] = total_loss_dict.get( - key, get_accelerator().FloatTensor([0.0])) + loss_dict[key] - else: - value = loss_dict[key].float().sum().item() - is_nan = value == float('inf') or \ - value == -float('inf') or \ - value != value - got_nan = got_nan or is_nan - total_loss_dict[nan_iters_key] = total_loss_dict.get( - nan_iters_key, 0) + int(got_nan) - - # Logging. 
- timers_to_log = [ - 'forward-backward', - 'forward-compute', - 'backward-compute', - 'batch-generator', - 'forward-recv', - 'forward-send', - 'backward-recv', - 'backward-send', - 'forward-send-forward-recv', - 'forward-send-backward-recv', - 'backward-send-forward-recv', - 'backward-send-backward-recv', - 'forward-backward-send-forward-backward-recv', - 'layernorm-grads-all-reduce', - 'embedding-grads-all-reduce', - 'grads-all-reduce', - 'grads-reduce-scatter', - 'params-all-gather', - 'optimizer-copy-to-main-grad', - 'optimizer-unscale-and-check-inf', - 'optimizer-clip-main-grad', - 'optimizer-count-zeros', - 'optimizer-inner-step', - 'optimizer-copy-main-to-model-params', - 'optimizer'] - - # Calculate batch size. - batch_size = args.micro_batch_size * args.data_parallel_size * \ - get_num_microbatches() - - total_iterations = total_loss_dict[advanced_iters_key] + \ - total_loss_dict[skipped_iters_key] - - # Tensorboard values. - # Timer requires all the ranks to call. - if args.log_timers_to_tensorboard and \ - (iteration % args.tensorboard_log_interval == 0): - timers.write(timers_to_log, writer, iteration, - normalizer=total_iterations) - if writer and (iteration % args.tensorboard_log_interval == 0): - writer.add_scalar('steps-vs-samples/y=steps,x=samples', iteration, args.consumed_train_samples) - writer.add_scalar('steps-vs-samples/y=samples,x=steps', args.consumed_train_samples, iteration) - writer.add_scalar('steps-vs-tokens/y=steps,x=tokens', iteration, args.consumed_train_tokens) - writer.add_scalar('steps-vs-tokens/y=tokens,x=steps', args.consumed_train_tokens, iteration) - if args.log_learning_rate_to_tensorboard: - writer.add_scalar('learning-rate/learning-rate', learning_rate, iteration) - writer.add_scalar('learning-rate/learning-rate vs samples', learning_rate, - args.consumed_train_samples) - writer.add_scalar('learning-rate/learning-rate vs tokens', learning_rate, - args.consumed_train_tokens) - if args.log_batch_size_to_tensorboard: - 
writer.add_scalar('batch-size/batch-size', batch_size, iteration) - writer.add_scalar('batch-size/batch-size vs samples', batch_size, - args.consumed_train_samples) - writer.add_scalar('batch-size/batch-size vs tokens', batch_size, - args.consumed_train_tokens) - for key in loss_dict: - writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration) - writer.add_scalar(f"lm-loss-training/{key}" + ' vs samples', loss_dict[key], - args.consumed_train_samples) - writer.add_scalar(f"lm-loss-training/{key}" + ' vs tokens', loss_dict[key], - args.consumed_train_tokens) - if args.fp16 and args.log_loss_scale_to_tensorboard: - writer.add_scalar('loss-scale/loss-scale', loss_scale, iteration) - writer.add_scalar('loss-scale/loss-scale vs samples', loss_scale, - args.consumed_train_samples) - writer.add_scalar('loss-scale/loss-scale vs tokens', loss_scale, - args.consumed_train_tokens) - if args.log_world_size_to_tensorboard: - writer.add_scalar('world-size/world-size', args.world_size, iteration) - writer.add_scalar('world-size/world-size vs samples', args.world_size, - args.consumed_train_samples) - writer.add_scalar('world-size/world-size vs tokens', args.world_size, - args.consumed_train_tokens) - if grad_norm is not None: - writer.add_scalar('grad-norm/grad-norm', grad_norm, iteration) - writer.add_scalar('grad-norm/grad-norm vs samples', grad_norm, - args.consumed_train_samples) - writer.add_scalar('grad-norm/grad-norm vs tokens', grad_norm, - args.consumed_train_tokens) - if num_zeros_in_grad is not None: - writer.add_scalar('num-zeros/num-zeros', num_zeros_in_grad, iteration) - writer.add_scalar('num-zeros/num-zeros vs samples', num_zeros_in_grad, - args.consumed_train_samples) - writer.add_scalar('num-zeros/num-zeros vs tokens', num_zeros_in_grad, - args.consumed_train_tokens) - if params_norm is not None: - writer.add_scalar('params-norm/params-norm', params_norm, iteration) - writer.add_scalar('params-norm/params-norm vs samples', params_norm, - 
args.consumed_train_samples) - writer.add_scalar('params-norm/params-norm vs tokens', params_norm, - args.consumed_train_tokens) - if hasattr(args, 'actual_seq_length'): - writer.add_scalar('seqlen/actual_seq_length', args.actual_seq_length, - iteration) - writer.add_scalar('seqlen/actual_seq_length vs samples', args.actual_seq_length, - args.consumed_train_samples) - writer.add_scalar('seqlen/actual_seq_length vs tokens', args.actual_seq_length, - args.consumed_train_tokens) - if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - writer.add_scalar('seqlen/curriculum_seqlen', args.curriculum_seqlen, - iteration) - writer.add_scalar('seqlen/curriculum_seqlen vs samples', args.curriculum_seqlen, - args.consumed_train_samples) - writer.add_scalar('seqlen/curriculum_seqlen vs tokens', args.curriculum_seqlen, - args.consumed_train_tokens) - if args.random_ltd: - writer.add_scalar('seqlen/random_ltd_reserved_length', args.random_ltd_reserved_length, - iteration) - writer.add_scalar('seqlen/random_ltd_reserved_length vs samples', args.random_ltd_reserved_length, - args.consumed_train_samples) - writer.add_scalar('seqlen/random_ltd_reserved_length vs tokens', args.random_ltd_reserved_length, - args.consumed_train_tokens) - if args.log_memory_to_tensorboard: - mem_stats = torch.cuda.memory_stats() - writer.add_scalar( - "mem-reserved-bytes", - mem_stats["reserved_bytes.all.current"], - iteration, - ) - writer.add_scalar( - "mem-allocated-bytes", - mem_stats["allocated_bytes.all.current"], - iteration, - ) - writer.add_scalar( - "mem-allocated-count", - mem_stats["allocation.all.current"], - iteration, + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # Average loss across microbatches. 
+ loss_reduced = {} + for key in losses_reduced[0]: + losses_reduced_for_key = [x[key] for x in losses_reduced] + loss_reduced[key] = sum(losses_reduced_for_key) / len( + losses_reduced_for_key ) - - if iteration % args.tensorboard_log_interval == 0: - # This logging write various optimizer states to tensorboard. This - # feature may consume extra GPU memory thus is set at false by default. - if args.log_optimizer_states_to_tensorboard and optimizer is not None: - opt_stats = [0.0] * 8 - opt_stats_2 = [0.0] * 4 - for _, group in enumerate(optimizer.param_groups): - for _, param in enumerate(group['params']): - opt_stats[0] += (torch.norm(optimizer.state[param]['exp_avg_sq']).item())**2 - opt_stats[1] += (torch.norm(optimizer.state[param]['exp_avg_sq'].sqrt()).item())**2 - opt_stats[2] += (torch.norm(optimizer.state[param]['exp_avg']).item())**2 - opt_stats[3] += (torch.norm(param).item())**2 - opt_stats[4] += torch.norm(optimizer.state[param]['exp_avg_sq'],p=1).item() - opt_stats[5] += torch.norm(optimizer.state[param]['exp_avg_sq'].sqrt(),p=1).item() - opt_stats[6] += torch.norm(optimizer.state[param]['exp_avg'],p=1).item() - opt_stats[7] += torch.norm(param,p=1).item() - opt_stats_2[0] = max(opt_stats_2[0], abs(optimizer.state[param]['exp_avg_sq'].max().item()), abs(optimizer.state[param]['exp_avg_sq'].min().item())) - opt_stats_2[1] = max(opt_stats_2[1], optimizer.state[param]['exp_avg_sq'].sqrt().abs_().max().item()) - opt_stats_2[2] = max(opt_stats_2[2], abs(optimizer.state[param]['exp_avg'].max().item()), abs(optimizer.state[param]['exp_avg'].min().item())) - opt_stats_2[3] = max(opt_stats_2[3], abs(param.max().item()), abs(param.min().item())) - # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) - if args.zero_stage > 0: - # ZeRO partiions optimizer states - opt_stats = get_accelerator().FloatTensor(opt_stats) - torch.distributed.all_reduce(opt_stats, 
group=mpu.get_sequence_data_parallel_group()) - opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) - torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, - group=mpu.get_sequence_data_parallel_group()) - - if args.tensor_model_parallel_size > 1: - opt_stats = get_accelerator().FloatTensor(opt_stats) - torch.distributed.all_reduce(opt_stats, group=mpu.get_tensor_model_parallel_group()) - opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) - torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, - group=mpu.get_tensor_model_parallel_group()) - - if args.pipeline_model_parallel_size > 1: - opt_stats = get_accelerator().FloatTensor(opt_stats) - torch.distributed.all_reduce(opt_stats, group=mpu.get_pipeline_model_parallel_group()) - opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) - torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, - group=mpu.get_pipeline_model_parallel_group()) - - # print('step {} rank {} after sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) - if writer and is_last_rank(): - writer.add_scalar('optimizer/variance_l2 vs tokens', opt_stats[0]**0.5, args.consumed_train_tokens) - writer.add_scalar('optimizer/variance_sqrt_l2 vs tokens', opt_stats[1]**0.5, args.consumed_train_tokens) - writer.add_scalar('optimizer/momentum_l2 vs tokens', opt_stats[2]**0.5, args.consumed_train_tokens) - writer.add_scalar('optimizer/weight_l2 vs tokens', opt_stats[3]**0.5, args.consumed_train_tokens) - writer.add_scalar('optimizer/variance_l1 vs tokens', opt_stats[4], args.consumed_train_tokens) - writer.add_scalar('optimizer/variance_sqrt_l1 vs tokens', opt_stats[5], args.consumed_train_tokens) - writer.add_scalar('optimizer/momentum_l1 vs tokens', opt_stats[6], args.consumed_train_tokens) - writer.add_scalar('optimizer/weight_l1 vs tokens', opt_stats[7], args.consumed_train_tokens) - writer.add_scalar('optimizer/variance_abs_max vs 
tokens', opt_stats_2[0], args.consumed_train_tokens) - writer.add_scalar('optimizer/variance_sqrt_abs_max vs tokens', opt_stats_2[1], args.consumed_train_tokens) - writer.add_scalar('optimizer/momentum_abs_max vs tokens', opt_stats_2[2], args.consumed_train_tokens) - writer.add_scalar('optimizer/weight_abs_max vs tokens', opt_stats_2[3], args.consumed_train_tokens) - - writer.add_scalar('optimizer/variance_l2', opt_stats[0]**0.5, iteration) - writer.add_scalar('optimizer/variance_sqrt_l2', opt_stats[1]**0.5, iteration) - writer.add_scalar('optimizer/momentum_l2', opt_stats[2]**0.5, iteration) - writer.add_scalar('optimizer/weight_l2', opt_stats[3]**0.5, iteration) - writer.add_scalar('optimizer/variance_l1', opt_stats[4], iteration) - writer.add_scalar('optimizer/variance_sqrt_l1', opt_stats[5], iteration) - writer.add_scalar('optimizer/momentum_l1', opt_stats[6], iteration) - writer.add_scalar('optimizer/weight_l1', opt_stats[7], iteration) - writer.add_scalar('optimizer/variance_abs_max', opt_stats_2[0], iteration) - writer.add_scalar('optimizer/variance_sqrt_abs_max', opt_stats_2[1], iteration) - writer.add_scalar('optimizer/momentum_abs_max', opt_stats_2[2], iteration) - writer.add_scalar('optimizer/weight_abs_max', opt_stats_2[3], iteration) - - assert args is not None - if iteration % args.log_interval == 0: - elapsed_time = timers('interval-time').elapsed(barrier=True) - elapsed_time_per_iteration = elapsed_time / total_iterations - seq_len = args.seq_length - if hasattr(args, 'actual_seq_length'): - seq_len = args.actual_seq_length - samples_per_sec, tflops, approx_parameters_in_billions = throughput_calculator( - model, - args, - elapsed_time, - total_iterations - ) - samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size - tokens_per_sec = samples_per_sec * seq_len - tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size - tokens_per_gpu_per_second = tokens_per_sec / args.world_size - tokens_per_gpu_per_second_per_replica = 
tokens_per_gpu_per_second / args.data_parallel_size - wandb_metrics = {} - if wandb is not None and getattr(wandb, 'run', None) is not None: - assert wandb.run is not None - wandb_metrics = { - 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s - 'throughput/samples_per_sec': samples_per_sec, - 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, - 'throughput/tokens_per_sec': tokens_per_sec, - 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, - 'throughput/tokens_per_gpu_per_sec': tokens_per_gpu_per_second, - 'throughput/tokens_per_gpu_per_sec_per_replica': tokens_per_gpu_per_second_per_replica, - 'throughput/tflops': tflops, - 'throughput/approx_params_in_billions': approx_parameters_in_billions, - 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, - 'throughput/iteration': iteration, - } - if loss_dict is not None: - wandb_metrics |= { - 'loss/iteration': iteration, - **{f'loss/{k}': v for k, v in loss_dict.items()} - } - if writer and args.log_timers_to_tensorboard: - writer.add_scalar('iteration-time/iteration-time', - elapsed_time_per_iteration, iteration) - writer.add_scalar('iteration-time/iteration-time vs samples', - elapsed_time_per_iteration, args.consumed_train_samples) - writer.add_scalar('iteration-time/iteration-time vs tokens', - elapsed_time_per_iteration, args.consumed_train_tokens) - log_string = ' iteration {:8d}/{:8d} |'.format( - iteration, args.train_iters) - log_string += ' consumed samples: {:12d} |'.format( - args.consumed_train_samples) - log_string += ' consumed tokens: {:12d} |'.format( - args.consumed_train_tokens) - log_string += ' elapsed time per iteration (ms): {:.1f} |'.format( - elapsed_time_per_iteration * 1000.0) - log_string += ' learning rate: {:.3E} |'.format(learning_rate) - log_string += ' global batch size: {:5d} |'.format(batch_size) - if wandb is not None and getattr(wandb, 'run', None) is not None: - wandb_metrics |= { - 'training/iteration': 
iteration, - 'training/iteration_time': elapsed_time_per_iteration, - 'training/iteration_time_vs_tokens': ( - (elapsed_time_per_iteration - / args.consumed_train_tokens) - ), - 'training/iteration_time_vs_samples': ( - (elapsed_time_per_iteration - / args.consumed_train_samples), - ), - 'training/consumed_samples': args.consumed_train_samples, - 'training/consumed_tokens': args.consumed_train_tokens, - } - for key in total_loss_dict: - if key not in [advanced_iters_key, skipped_iters_key, - nan_iters_key]: - avg = total_loss_dict[key].item() / \ - float(max(1, total_loss_dict[advanced_iters_key])) - if avg > 0.0: - log_string += ' {}: {:.6E} |'.format(key, avg) - total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) - if loss_scale is not None: - log_string += ' loss scale: {:.1f} |'.format(loss_scale) - wandb_metrics |= {'loss/loss_scale': loss_scale} - if grad_norm is not None: - log_string += ' grad norm: {:.3f} |'.format(grad_norm) - wandb_metrics |= {'loss/grad_norm': grad_norm} - if num_zeros_in_grad is not None: - log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) - wandb_metrics |= {'loss/num_zeros_in_grad': num_zeros_in_grad} - if params_norm is not None: - log_string += ' params norm: {:.3f} |'.format(params_norm) - wandb_metrics |= {'loss/params_norm': params_norm} - if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - log_string += ' curriculum seqlen: {:5d} |'.format(args.curriculum_seqlen) - if args.random_ltd: - log_string += ' random ltd reserved length: {:5d} |'.format(args.random_ltd_reserved_length) - log_string += ' actual seqlen: {:5d} |'.format(seq_len) - log_string += ' number of skipped iterations: {:3d} |'.format( - total_loss_dict[skipped_iters_key]) - log_string += ' number of nan iterations: {:3d} |'.format( - total_loss_dict[nan_iters_key]) - log_string += ' samples per second: {:.3f} |'.format(samples_per_sec) - log_string += ' tokens per gpu per second (tgs): {:.3f} 
|'.format(tokens_per_gpu_per_second) - log_string += ' TFLOPs: {:.2f} |'.format(tflops) - total_loss_dict[advanced_iters_key] = 0 - total_loss_dict[skipped_iters_key] = 0 - total_loss_dict[nan_iters_key] = 0 - print_rank_last(log_string) - if report_memory_flag and learning_rate > 0.: - # Report memory after optimizer state has been initialized. - report_memory('(after {} iterations)'.format(iteration)) - report_memory_flag = False - if wandb is not None and getattr(wandb, 'run', None) is not None: - wandb_metrics |= {'training/skiped_iterations': total_loss_dict[skipped_iters_key]} - wandb_metrics |= {'training/nan_iterations': total_loss_dict[nan_iters_key]} - wandb.log(wandb_metrics) - if timers is not None: - timers.log(timers_to_log, normalizer=args.log_interval) - - return report_memory_flag + return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad + return {}, skipped_iter, grad_norm, num_zeros_in_grad +@dlp.log +@ez.dist.timeitlogit(rank=RANK) def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler): timers = get_timers() + assert timers is not None # Extra barrier is added to make sure # all ranks report the max time. 
# assert timers is not None - timers('save-checkpoint', log_level=0).start(barrier=True) + timers("save-checkpoint", log_level=0).start(barrier=True) save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - timers('save-checkpoint').stop(barrier=True) - checkpoint_throughput_calculator(model, timers('save-checkpoint').elapsed(reset=False)) - timers.log(['save-checkpoint']) - - -def train(forward_step_func, model, optimizer, opt_param_scheduler, - train_data_iterator, valid_data_iterator, - process_non_loss_data_func): + timers("save-checkpoint").stop(barrier=True) + checkpoint_throughput_calculator( + model, timers("save-checkpoint").elapsed(reset=False) + ) + timers.log(["save-checkpoint"]) + + +@dlp.log +def train( + forward_step_func, + model, + optimizer, + opt_param_scheduler, + train_data_iterator, + valid_data_iterator, + process_non_loss_data_func, +): """Train the model function.""" args = get_args() timers = get_timers() - + accelerator = get_accelerator() + assert args is not None and timers is not None and accelerator is not None # Write args to tensorboard write_args_to_tensorboard() - + assert accelerator is not None + setup_profiler(args, accelerator.device_name()) if args.random_ltd: # random-ltd requires different randomness on each rank import random - random.seed(args.seed + torch.distributed.get_rank()) + random.seed(args.seed + torch.distributed.get_rank()) # Turn on training mode which enables dropout. for model_module in model: model_module.train() - + grad_norm = None # Tracking loss. total_loss_dict = {} - + loss_dict = {"skipped_iter": 0} # Iterations. 
iteration = args.iteration - # Translate args to core configuration config = core_transformer_config_from_args(args) + num_skipped_iters = 0 if not args.deepspeed: config.grad_scale_func = optimizer.scale_loss config.timers = timers - - timers('interval-time', log_level=0).start(barrier=True) - print_datetime('before the start of training step') + timers("interval-time", log_level=0).start(barrier=True) + print_datetime("before the start of training step") report_memory_flag = True if args.random_ltd: assert model[0].random_ltd_enabled() - args.random_ltd_layer_num = model[0].random_ltd_scheduler.get_random_ltd_layer_num() - - while iteration < args.train_iters and (args.train_tokens is None or \ - args.consumed_train_tokens < args.train_tokens): + args.random_ltd_layer_num = model[ + 0 + ].random_ltd_scheduler.get_random_ltd_layer_num() + ranges_to_skip = None + if args.train_range_to_skip is not None: + assert ( + len(args.train_range_to_skip) % 2 == 0 + ), f"""Expected --train-range-to-skip to have an even number of values. 
+ Received: {len(args.train_range_to_skip)} + """ + ranges_to_skip = list( + zip( + args.train_range_to_skip[::2], + args.train_range_to_skip[1::2], + ) + ) + while iteration < args.train_iters and ( + args.train_tokens is None or args.consumed_train_tokens < args.train_tokens + ): + trigger(on_step_begin) update_num_microbatches(args.consumed_train_samples) if args.deepspeed: # inform deepspeed of any batch size changes - global_batch_size = mpu.get_data_parallel_world_size() * \ - args.micro_batch_size * \ - get_num_microbatches() + global_batch_size = ( + mpu.get_data_parallel_world_size() + * args.micro_batch_size + * get_num_microbatches() + ) model[0].set_train_batch_size(global_batch_size) - if args.curriculum_learning_legacy and not args.no_pipeline_parallel: - curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \ - args.iteration + 1) + curriculum_seqlen = args.curriculum_scheduler.update_difficulty( + args.iteration + 1 + ) if iteration == 0 or curriculum_seqlen != args.curriculum_seqlen: if args.use_rotary_position_embeddings: update_rotary_pos_emb(curriculum_seqlen) args.curriculum_seqlen = curriculum_seqlen args.curr_iteration = iteration - loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ - train_step(forward_step_func, - train_data_iterator, - model, - optimizer, - opt_param_scheduler, - config) + if ranges_to_skip is not None and any( + [i <= (iteration + 1) <= j for (i, j) in ranges_to_skip] + ): + log.info(f"Caught {iteration + 1} in 'ranges_to_skip', skipping!") + skipped_iter = 1 + num_skipped_iters += 1 + num_zeros_in_grad = None + gas = args.deepspeed_config_dict["gradient_accumulation_steps"] + for microstep in range(gas): + _batch = next(train_data_iterator) + _tokens = _batch["text"] + if ( + iteration < 10 + and os.environ.get("DUMP_SKIPPED_ITERS", None) + and RANK == 0 + ): + log.info(f"{_tokens.shape}, {len(train_data_iterator)=}") + log.info( + f"{iteration=} [{microstep}/{gas}]: ({_tokens.shape})\n{_tokens[:10]=}" 
+ ) + + increment = ( + get_num_microbatches() * args.micro_batch_size * args.data_parallel_size + ) + model[0].skipped_steps += 1 + model[0].global_steps += 1 + model[0].micro_steps += 1 + model[0].global_samples += model[0].train_batch_size() + opt_param_scheduler.step(increment=increment) + else: + if os.getenv("TORCH_PROFILER_ENABLE") == "2": + from torch.profiler import profile, ProfilerActivity + + try: + activities = [ + ProfilerActivity.CPU, + ProfilerActivity.CUDA, + ProfilerActivity.XPU, # type:ignore + ] + except Exception: + log.warning("TORCH PROFILER WARNING: XPU is not supported") + activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA] + with profile(activities=activities) as prof: + loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = train_step( + forward_step_func, + train_data_iterator, + model, + optimizer, + opt_param_scheduler, + config, + ) + prof.export_chrome_trace( + f"{args.trace_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}-step{iteration}.json" + ) + else: + loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = train_step( + forward_step_func, + train_data_iterator, + model, + optimizer, + opt_param_scheduler, + config, + ) iteration += 1 args.iteration = iteration - new_samples = mpu.get_data_parallel_world_size() * \ - args.micro_batch_size * \ - get_num_microbatches() + new_samples = ( + mpu.get_data_parallel_world_size() + * args.micro_batch_size + * get_num_microbatches() + ) args.consumed_train_samples += new_samples # This actual_seq_length is used for actual consumed tokens calculation, flops calculation, and logging. 
args.actual_seq_length = args.seq_length if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: args.actual_seq_length = args.curriculum_seqlen if args.random_ltd: - args.random_ltd_reserved_length = model[0].random_ltd_scheduler.get_current_seq() + args.random_ltd_reserved_length = model[ + 0 + ].random_ltd_scheduler.get_current_seq() if args.random_ltd_reserved_length < args.actual_seq_length: - args.actual_seq_length = (args.actual_seq_length * (args.num_layers - args.random_ltd_layer_num) + args.random_ltd_reserved_length * args.random_ltd_layer_num) // args.num_layers + args.actual_seq_length = ( + args.actual_seq_length + * (args.num_layers - args.random_ltd_layer_num) + + args.random_ltd_reserved_length * args.random_ltd_layer_num + ) // args.num_layers if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - if hasattr(args, 'data_efficiency_curriculum_learning_numel'): - act_mbsz = args.data_efficiency_curriculum_learning_numel / args.curriculum_seqlen + if hasattr(args, "data_efficiency_curriculum_learning_numel"): + act_mbsz = ( + args.data_efficiency_curriculum_learning_numel + / args.curriculum_seqlen + ) act_token = act_mbsz * args.actual_seq_length - args.consumed_train_tokens += mpu.get_data_parallel_world_size() * \ - get_num_microbatches() * act_token + args.consumed_train_tokens += ( + mpu.get_data_parallel_world_size() + * get_num_microbatches() + * act_token + ) else: args.consumed_train_tokens += new_samples * args.actual_seq_length else: args.consumed_train_tokens += new_samples * args.actual_seq_length - # Logging. 
if args.deepspeed: - if hasattr(model[0].optimizer, 'cur_scale'): + if hasattr(model[0].optimizer, "cur_scale"): loss_scale = model[0].optimizer.cur_scale else: loss_scale = None @@ -1253,81 +1175,103 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, params_norm = None if args.log_params_norm: params_norm = calc_params_l2_norm(model) - report_memory_flag = training_log(loss_dict, total_loss_dict, - optimizer.param_groups[0]['lr'], - iteration, loss_scale, - report_memory_flag, skipped_iter, - grad_norm, params_norm, num_zeros_in_grad, - model, optimizer) - + report_memory_flag = training_log( + loss_dict, + total_loss_dict, + optimizer.param_groups[0]["lr"], + iteration, + loss_scale, + report_memory_flag, + skipped_iter, + grad_norm, + params_norm, + num_zeros_in_grad, + model, + optimizer, + ) # Autoresume - if args.adlr_autoresume and \ - (iteration % args.adlr_autoresume_interval == 0): - check_adlr_autoresume_termination(iteration, model, optimizer, - opt_param_scheduler) - + if args.adlr_autoresume and (iteration % args.adlr_autoresume_interval == 0): + check_adlr_autoresume_termination( + iteration, model, optimizer, opt_param_scheduler + ) # Evaluation - if args.eval_interval and iteration % args.eval_interval == 0 and \ - args.do_valid: - prefix = 'iteration {}'.format(iteration) - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, - config, False) - + if args.eval_interval and iteration % args.eval_interval == 0 and args.do_valid: + prefix = "iteration {}".format(iteration) + evaluate_and_print_results( + prefix, + forward_step_func, + valid_data_iterator, + model, + iteration, + process_non_loss_data_func, + config, + False, + ) # Checkpointing saved_checkpoint = False if args.exit_signal_handler: signal_handler = get_signal_handler() - if any(signal_handler.signals_received()): - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) - 
print_datetime('exiting program after receiving SIGTERM.') + # if any(signal_handler.signals_received()): + if signal_handler is not None and any(signal_handler.signals_received()): + save_checkpoint_and_time( + iteration, model, optimizer, opt_param_scheduler + ) + print_datetime("exiting program after receiving SIGTERM.") sys.exit() - - if args.save and args.save_interval and \ - iteration % args.save_interval == 0: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) + if args.save and args.save_interval and iteration % args.save_interval == 0: + save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler) saved_checkpoint = True - # Exiting based on duration if args.exit_duration_in_mins: train_time = (time.time() - _TRAIN_START_TIME) / 60.0 - done_cuda = get_accelerator().IntTensor( - [train_time > args.exit_duration_in_mins]) - torch.distributed.all_reduce( - done_cuda, op=torch.distributed.ReduceOp.MAX) + done_cuda = accelerator.IntTensor([train_time > args.exit_duration_in_mins]) + torch.distributed.all_reduce(done_cuda, op=torch.distributed.ReduceOp.MAX) done = done_cuda.item() if done: if not saved_checkpoint: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) - print_datetime('exiting program after {} minutes'.format(train_time)) + save_checkpoint_and_time( + iteration, model, optimizer, opt_param_scheduler + ) + print_datetime("exiting program after {} minutes".format(train_time)) sys.exit() - # Exiting based on iterations if args.exit_interval and iteration % args.exit_interval == 0: if args.save and not saved_checkpoint: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) + save_checkpoint_and_time( + iteration, model, optimizer, opt_param_scheduler + ) torch.distributed.barrier() - print_datetime('exiting program at iteration {}'.format(iteration)) + print_datetime("exiting program at iteration {}".format(iteration)) + sys.exit() + trigger(on_step_end) + # 
Exiting based on kill switch file + if found_kill_switch(): + if args.save and not saved_checkpoint: + save_checkpoint_and_time( + iteration, model, optimizer, opt_param_scheduler + ) + torch.distributed.barrier() + print_datetime( + f"Detected kill switch at {args.kill_switch_file}, " + f"iteration={iteration}. Exiting" + ) sys.exit() - - return iteration -def evaluate(forward_step_func, - data_iterator, - model, - process_non_loss_data_func, - config, - verbose=False): +@dlp.log +def evaluate( + forward_step_func, + data_iterator, + model, + process_non_loss_data_func, + config, + verbose=False, +): """Evaluation.""" args = get_args() - + accelerator = get_accelerator() + assert args is not None and accelerator is not None if args.vision_pretraining and args.vision_pretraining_type == "dino": compute_feature_bank(model) @@ -1349,73 +1293,82 @@ def evaluate(forward_step_func, total_loss_dict = {} + num_microbatches = get_num_microbatches() + assert num_microbatches is not None + forward_backward_func = get_forward_backward_func() + with torch.no_grad(): iteration = 0 while iteration < args.eval_iters: iteration += 1 if verbose and iteration % args.log_interval == 0: - print_rank_0('Evaluating iter {}/{}'.format(iteration, - args.eval_iters)) + log.info("Evaluating iter {}/{}".format(iteration, args.eval_iters)) - forward_backward_func = get_forward_backward_func() # Don't care about timing during evaluation config.timers = None if args.deepspeed and args.ds_pipeline_enabled: # DeepSpeed uses eval_batch() and already aggregates losses. 
assert isinstance(model, list) and len(model) == 1 loss = model[0].eval_batch(data_iterator) - loss_dicts = [{'lm loss' : loss}] * get_num_microbatches() + loss_dicts = [{"lm loss": loss}] * num_microbatches else: loss_dicts = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, model=model, - num_microbatches=get_num_microbatches(), + num_microbatches=num_microbatches, seq_length=args.seq_length, micro_batch_size=args.micro_batch_size, decoder_seq_length=args.decoder_seq_length, - forward_only=True) + forward_only=True, + ) config.timers = get_timers() # Empty unused memory if args.empty_unused_memory_level >= 1: - torch.cuda.empty_cache() + accelerator.empty_cache() if mpu.is_pipeline_last_stage(ignore_virtual=True): # Reduce across processes. for loss_dict in loss_dicts: for key in loss_dict: - if 'moe' not in key: - total_loss_dict[key] = total_loss_dict.get( - key, get_accelerator().FloatTensor([0.0])) + loss_dict[key] - - args.consumed_valid_samples += mpu.get_data_parallel_world_size() \ - * args.micro_batch_size \ - * get_num_microbatches() + if "moe" not in key: + total_loss_dict[key] = ( + total_loss_dict.get(key, accelerator.FloatTensor([0.0])) + + loss_dict[key] + ) + + args.consumed_valid_samples += ( + mpu.get_data_parallel_world_size() + * args.micro_batch_size + * num_microbatches + ) collected_non_loss_data = None if process_non_loss_data_func is not None and is_last_rank(): collected_non_loss_data = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, model=model, - num_microbatches=get_num_microbatches(), + num_microbatches=num_microbatches, seq_length=args.seq_length, micro_batch_size=args.micro_batch_size, decoder_seq_length=args.decoder_seq_length, forward_only=True, - collect_non_loss_data=True) + collect_non_loss_data=True, + ) # Move model back to the train mode. 
for model_module in model: model_module.train() for key in total_loss_dict: - total_loss_dict[key] /= args.eval_iters * get_num_microbatches() + total_loss_dict[key] /= args.eval_iters * num_microbatches if args.curriculum_learning_legacy and not args.no_pipeline_parallel: # roll back to actual curriculum seqlen at the end of eval. - args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \ - args.iteration + 1) + args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( + args.iteration + 1 + ) if args.curriculum_seqlen < args.seq_length: if args.use_rotary_position_embeddings: update_rotary_pos_emb(args.curriculum_seqlen) @@ -1423,51 +1376,92 @@ def evaluate(forward_step_func, return total_loss_dict, collected_non_loss_data -def evaluate_and_print_results(prefix, forward_step_func, - data_iterator, model, - iteration, process_non_loss_data_func, config, - verbose=False, write_to_tensorboard=True, test=False): + +@dlp.log +def evaluate_and_print_results( + prefix, + forward_step_func, + data_iterator, + model, + iteration, + process_non_loss_data_func, + config, + verbose=False, + write_to_tensorboard=True, + test=False, +): """Helper function to evaluate and dump results on screen.""" args = get_args() + assert args is not None if write_to_tensorboard: writer = get_tensorboard_writer() else: writer = None total_loss_dict, collected_non_loss_data = evaluate( - forward_step_func, data_iterator, model, - process_non_loss_data_func, config, verbose) - string = ' validation loss at {} | '.format(prefix) + forward_step_func, + data_iterator, + model, + process_non_loss_data_func, + config, + verbose, + ) + key = "test" if test else "val" + if wandb is not None and wandb.run is not None: + wandb.log({ + f"{key}/iteration": iteration, + **{f"{key}/{k}": v for k, v in total_loss_dict.items()}, + **{ + f"{key}/ppl_{k}": math.exp(min(20, v.item())) + for k, v in total_loss_dict.items() + }, + }) + string = " validation loss at {} | ".format(prefix) 
for key in total_loss_dict: - string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) + string += f"{key} value={total_loss_dict[key].item():.6f}" ppl = math.exp(min(20, total_loss_dict[key].item())) - string += '{} PPL: {:.6E} | '.format(key, ppl) - if writer and is_last_rank(): - data_type = 'test' if test else 'validation' - writer.add_scalar(f'lm-loss-validation/{key} {data_type}', - total_loss_dict[key].item(), - iteration) - writer.add_scalar(f'lm-loss-validation/{key} {data_type} vs samples', - total_loss_dict[key].item(), - args.consumed_train_samples) - writer.add_scalar(f'lm-loss-validation/{key} {data_type} vs tokens', - total_loss_dict[key].item(), - args.consumed_train_tokens) + string += f"{key} PPL={ppl:.6f}" + # string += '{} PPL={:.6f} | '.format(key, ppl) + if writer is not None and is_last_rank(): + data_type = "test" if test else "validation" + writer.add_scalar( + f"lm-loss-validation/{key} {data_type}", + total_loss_dict[key].item(), + iteration, + ) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} vs samples", + total_loss_dict[key].item(), + args.consumed_train_samples, + ) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} vs tokens", + total_loss_dict[key].item(), + args.consumed_train_tokens, + ) if args.log_validation_ppl_to_tensorboard: - writer.add_scalar(f'lm-loss-validation/{key} {data_type} ppl', ppl, - iteration) - writer.add_scalar(f'lm-loss-validation/{key} {data_type} ppl vs samples', - ppl, args.consumed_train_samples) - writer.add_scalar(f'lm-loss-validation/{key} {data_type} ppl vs tokens', - ppl, args.consumed_train_tokens) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} ppl", ppl, iteration + ) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} ppl vs samples", + ppl, + args.consumed_train_samples, + ) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} ppl vs tokens", + ppl, + args.consumed_train_tokens, + ) if process_non_loss_data_func is not 
None and writer and is_last_rank(): process_non_loss_data_func(collected_non_loss_data, iteration, writer) length = len(string) + 1 - print_rank_last('-' * length) - print_rank_last(string) - print_rank_last('-' * length) + log.info("-" * length) + log.info(string) + log.info("-" * length) + return total_loss_dict def cyclic_iter(iter): @@ -1476,122 +1470,147 @@ def cyclic_iter(iter): yield x +@dlp.log +@ez.dist.timeitlogit(rank=RANK) def build_train_valid_test_datasets(build_train_valid_test_datasets_provider): """Build pretraining datasets.""" args = get_args() # Number of train/valid/test samples. + assert args is not None if args.train_samples: train_samples = args.train_samples else: train_samples = args.train_iters * args.global_batch_size - eval_iters = (args.train_iters // args.eval_interval + 1) * \ - args.eval_iters + eval_iters = (args.train_iters // args.eval_interval + 1) * args.eval_iters test_iters = args.eval_iters - train_val_test_num_samples = [train_samples, - eval_iters * args.global_batch_size, - test_iters * args.global_batch_size] - print_rank_0(' > datasets target sizes (minimum size):') - print_rank_0(' train: {}'.format(train_val_test_num_samples[0])) - print_rank_0(' validation: {}'.format(train_val_test_num_samples[1])) - print_rank_0(' test: {}'.format(train_val_test_num_samples[2])) + train_val_test_num_samples = [ + train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size, + ] + log.info(" > datasets target sizes (minimum size):") + log.info(" train: {}".format(train_val_test_num_samples[0])) + log.info(" validation: {}".format(train_val_test_num_samples[1])) + log.info(" test: {}".format(train_val_test_num_samples[2])) # Build the datasets. 
return build_train_valid_test_datasets_provider(train_val_test_num_samples) -def build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider): +@dlp.log +@ez.dist.timeitlogit(rank=RANK) +def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider): """Build pretraining data loaders.""" - args = get_args() - + accelerator = get_accelerator() + assert args is not None and accelerator is not None (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) - - print_rank_0('> building train, validation, and test datasets ...') - + log.info("> building train, validation, and test datasets ...") # Backward compatibility, assume fixed batch size. if args.iteration > 0 and args.consumed_train_samples == 0: - assert args.train_samples is None, \ - 'only backward compatiblity support for iteration-based training' + assert ( + args.train_samples is None + ), "only backward compatiblity support for iteration-based training" args.consumed_train_samples = args.iteration * args.global_batch_size if args.iteration > 0 and args.consumed_valid_samples == 0: if args.train_samples is None: - args.consumed_valid_samples = (args.iteration // args.eval_interval) * \ - args.eval_iters * args.global_batch_size - + args.consumed_valid_samples = ( + (args.iteration // args.eval_interval) + * args.eval_iters + * args.global_batch_size + ) # Data loader only on rank 0 of each model parallel group. - ds_sequence_parallel = mpu.get_sequence_parallel_world_size() > 1 or args.force_ds_sequence_parallel - rank_in_parallel_group = mpu.get_sequence_parallel_rank() if ds_sequence_parallel else mpu.get_tensor_model_parallel_rank() + ds_sequence_parallel = ( + mpu.get_sequence_parallel_world_size() > 1 or args.force_ds_sequence_parallel + ) + rank_in_parallel_group = ( + mpu.get_sequence_parallel_rank() + if ds_sequence_parallel + else mpu.get_tensor_model_parallel_rank() + ) if rank_in_parallel_group == 0: # Build datasets. 
train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - build_train_valid_test_datasets_provider) - + build_train_valid_test_datasets_provider + ) # Build dataloders. train_dataloader = build_pretraining_data_loader( - train_ds, args.consumed_train_samples) + train_ds, args.consumed_train_samples + ) valid_dataloader = build_pretraining_data_loader( - valid_ds, args.consumed_valid_samples) + valid_ds, args.consumed_valid_samples + ) test_dataloader = build_pretraining_data_loader(test_ds, 0) - # Flags to know if we need to do training/validation/testing. do_train = train_dataloader is not None and args.train_iters > 0 do_valid = valid_dataloader is not None and args.eval_iters > 0 do_test = test_dataloader is not None and args.eval_iters > 0 # Need to broadcast num_tokens and num_type_tokens. - flags = get_accelerator().LongTensor( - [int(do_train), int(do_valid), int(do_test)]) + flags = accelerator.LongTensor([int(do_train), int(do_valid), int(do_test)]) else: - flags = get_accelerator().LongTensor([0, 0, 0]) - + flags = accelerator.LongTensor([0, 0, 0]) # Broadcast num tokens. 
if ds_sequence_parallel: - torch.distributed.broadcast(flags, - mpu.get_sequence_parallel_src_rank(), - group=mpu.get_sequence_parallel_group()) + torch.distributed.broadcast( + flags, + mpu.get_sequence_parallel_src_rank(), + group=mpu.get_sequence_parallel_group(), + ) else: - torch.distributed.broadcast(flags, - mpu.get_tensor_model_parallel_src_rank(), - group=mpu.get_tensor_model_parallel_group()) + torch.distributed.broadcast( + flags, + mpu.get_tensor_model_parallel_src_rank(), + group=mpu.get_tensor_model_parallel_group(), + ) args.do_train = flags[0].item() args.do_valid = flags[1].item() args.do_test = flags[2].item() - return train_dataloader, valid_dataloader, test_dataloader -def build_train_valid_test_data_iterators( - build_train_valid_test_datasets_provider): +@dlp.log +@ez.dist.timeitlogit(rank=RANK) +def build_train_valid_test_data_iterators(build_train_valid_test_datasets_provider): """Build pretraining data iterators.""" args = get_args() + assert args is not None # Build loaders. - train_dataloader, valid_dataloader, test_dataloader = \ - build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider) + train_dataloader, valid_dataloader, test_dataloader = ( + build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider) + ) # Build iterators. 
dl_type = args.dataloader_type - assert dl_type in ['single', 'cyclic'] + assert dl_type in ["single", "cyclic"] if train_dataloader is not None: - train_data_iterator = iter(train_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(train_dataloader)) + train_data_iterator = ( + iter(train_dataloader) + if dl_type == "single" + else iter(cyclic_iter(train_dataloader)) + ) else: train_data_iterator = None if valid_dataloader is not None: - valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(valid_dataloader)) + valid_data_iterator = ( + iter(valid_dataloader) + if dl_type == "single" + else iter(cyclic_iter(valid_dataloader)) + ) else: valid_data_iterator = None if test_dataloader is not None: - test_data_iterator = iter(test_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(test_dataloader)) + test_data_iterator = ( + iter(test_dataloader) + if dl_type == "single" + else iter(cyclic_iter(test_dataloader)) + ) else: test_data_iterator = None diff --git a/megatron/training_log.py b/megatron/training_log.py new file mode 100644 index 0000000000..3eb96c392d --- /dev/null +++ b/megatron/training_log.py @@ -0,0 +1,669 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+""" +training_log.py +""" + +import logging +import os + +from deepspeed import get_accelerator +import ezpz as ez +import torch + +from megatron.core import mpu +from megatron.global_vars import ( + get_args, + get_num_microbatches, + get_tensorboard_writer, + get_timers, +) +from megatron.utils import ( + Profile, + is_last_rank, + report_memory, + throughput_calculator, + num_floating_point_operations, +) + + +RANK: int = ez.get_rank() +WORLD_SIZE: int = ez.get_world_size() +DEVICE_TYPE: str = ez.dist.get_torch_device_type() +DEVICE: torch.device = torch.device(DEVICE_TYPE) + +log: logging.Logger = logging.getLogger(__name__) +LOG_LEVEL: str = str(os.environ.get("LOG_LEVEL", "INFO")).upper() +log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL") + +try: + import wandb +except (ImportError, ModuleNotFoundError): + wandb = None + + +dlp = Profile("TRAINING_LOG") + + +@dlp.log +def training_log( + loss_dict, + total_loss_dict, + learning_rate, + iteration, + loss_scale, + report_memory_flag, + skipped_iter, + grad_norm, + params_norm, + num_zeros_in_grad, + model=None, + optimizer=None, +): + """Log training information such as losses, timing, ....""" + args = get_args() + accelerator = get_accelerator() + timers = get_timers() + writer = get_tensorboard_writer() + assert args is not None and timers is not None and accelerator is not None + wandb_metrics = {} + # Advanced, skipped, and Nan iterations. + advanced_iters_key = "advanced iterations" + skipped_iters_key = "skipped iterations" + nan_iters_key = "nan iterations" + # Advanced iterations. + if not skipped_iter: + total_loss_dict[advanced_iters_key] = ( + total_loss_dict.get(advanced_iters_key, 0) + 1 + ) + else: + if advanced_iters_key not in total_loss_dict: + total_loss_dict[advanced_iters_key] = 0 + # Skipped iterations. 
+ total_loss_dict[skipped_iters_key] = ( + total_loss_dict.get(skipped_iters_key, 0) + skipped_iter + ) + # Update losses and set nan iterations + got_nan = False + for key in loss_dict: + if not skipped_iter: + total_loss_dict[key] = ( + total_loss_dict.get(key, accelerator.FloatTensor([0.0])) + + loss_dict[key] + ) + else: + try: + value = loss_dict[key].float().sum().item() + except AttributeError: + value = loss_dict[key] + is_nan = value == float("inf") or value == -float("inf") or value != value + got_nan = got_nan or is_nan + total_loss_dict[nan_iters_key] = total_loss_dict.get(nan_iters_key, 0) + int( + got_nan + ) + + # Logging. + timers_to_log = [ + "forward-backward", + "forward-compute", + "backward-compute", + "batch-generator", + "forward-recv", + "forward-send", + "backward-recv", + "backward-send", + "forward-send-forward-recv", + "forward-send-backward-recv", + "backward-send-forward-recv", + "backward-send-backward-recv", + "forward-backward-send-forward-backward-recv", + "layernorm-grads-all-reduce", + "embedding-grads-all-reduce", + "grads-all-reduce", + "grads-reduce-scatter", + "params-all-gather", + "optimizer-copy-to-main-grad", + "optimizer-unscale-and-check-inf", + "optimizer-clip-main-grad", + "optimizer-count-zeros", + "optimizer-inner-step", + "optimizer-copy-main-to-model-params", + "optimizer", + ] + + # Calculate batch size. + batch_size = ( + args.micro_batch_size * args.data_parallel_size * get_num_microbatches() + ) + total_iterations = ( + total_loss_dict[advanced_iters_key] + total_loss_dict[skipped_iters_key] + ) + + # Tensorboard values. + # Timer requires all the ranks to call. 
+ if args.log_timers_to_tensorboard and ( + iteration % args.tensorboard_log_interval == 0 + ): + timers.write(timers_to_log, writer, iteration, normalizer=total_iterations) + if writer and (iteration % args.tensorboard_log_interval == 0): + writer.add_scalar( + "steps-vs-samples/y=steps,x=samples", iteration, args.consumed_train_samples + ) + writer.add_scalar( + "steps-vs-samples/y=samples,x=steps", args.consumed_train_samples, iteration + ) + writer.add_scalar( + "steps-vs-tokens/y=steps,x=tokens", iteration, args.consumed_train_tokens + ) + writer.add_scalar( + "steps-vs-tokens/y=tokens,x=steps", args.consumed_train_tokens, iteration + ) + if args.log_learning_rate_to_tensorboard: + wandb_metrics |= { + "learning-rate/iteration": iteration, + "learning-rate/learning-rate": learning_rate, + } + writer.add_scalar("learning-rate/learning-rate", learning_rate, iteration) + writer.add_scalar( + "learning-rate/learning-rate vs samples", + learning_rate, + args.consumed_train_samples, + ) + writer.add_scalar( + "learning-rate/learning-rate vs tokens", + learning_rate, + args.consumed_train_tokens, + ) + if args.log_batch_size_to_tensorboard: + writer.add_scalar("batch-size/batch-size", batch_size, iteration) + writer.add_scalar( + "batch-size/batch-size vs samples", + batch_size, + args.consumed_train_samples, + ) + writer.add_scalar( + "batch-size/batch-size vs tokens", + batch_size, + args.consumed_train_tokens, + ) + wandb_metrics |= { + "lm-loss-training/iteration": iteration, + "lm-loss-training/consumed_train_tokens": args.consumed_train_tokens, + } + for key in loss_dict: + wandb_metrics |= {f"lm-loss-training/{key}": loss_dict[key]} + writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration) + writer.add_scalar( + f"lm-loss-training/{key}" + " vs samples", + loss_dict[key], + args.consumed_train_samples, + ) + writer.add_scalar( + f"lm-loss-training/{key}" + " vs tokens", + loss_dict[key], + args.consumed_train_tokens, + ) + if args.fp16 and 
loss_scale and args.log_loss_scale_to_tensorboard: + writer.add_scalar("loss-scale/loss-scale", loss_scale, iteration) + writer.add_scalar( + "loss-scale/loss-scale vs samples", + loss_scale, + args.consumed_train_samples, + ) + writer.add_scalar( + "loss-scale/loss-scale vs tokens", + loss_scale, + args.consumed_train_tokens, + ) + if args.log_world_size_to_tensorboard: + writer.add_scalar("world-size/world-size", args.world_size, iteration) + writer.add_scalar( + "world-size/world-size vs samples", + args.world_size, + args.consumed_train_samples, + ) + writer.add_scalar( + "world-size/world-size vs tokens", + args.world_size, + args.consumed_train_tokens, + ) + if grad_norm is not None: + wandb_metrics |= {"training/grad-norm": grad_norm} + writer.add_scalar("grad-norm/grad-norm", grad_norm, iteration) + writer.add_scalar( + "grad-norm/grad-norm vs samples", grad_norm, args.consumed_train_samples + ) + writer.add_scalar( + "grad-norm/grad-norm vs tokens", grad_norm, args.consumed_train_tokens + ) + if num_zeros_in_grad is not None: + wandb_metrics |= {"training/num-zeros": num_zeros_in_grad} + writer.add_scalar("num-zeros/num-zeros", num_zeros_in_grad, iteration) + writer.add_scalar( + "num-zeros/num-zeros vs samples", + num_zeros_in_grad, + args.consumed_train_samples, + ) + writer.add_scalar( + "num-zeros/num-zeros vs tokens", + num_zeros_in_grad, + args.consumed_train_tokens, + ) + if params_norm is not None: + wandb_metrics |= {"training/params-norm": params_norm} + writer.add_scalar("params-norm/params-norm", params_norm, iteration) + writer.add_scalar( + "params-norm/params-norm vs samples", + params_norm, + args.consumed_train_samples, + ) + writer.add_scalar( + "params-norm/params-norm vs tokens", + params_norm, + args.consumed_train_tokens, + ) + if hasattr(args, "actual_seq_length"): + writer.add_scalar( + "seqlen/actual_seq_length", args.actual_seq_length, iteration + ) + writer.add_scalar( + "seqlen/actual_seq_length vs samples", + 
args.actual_seq_length, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/actual_seq_length vs tokens", + args.actual_seq_length, + args.consumed_train_tokens, + ) + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + writer.add_scalar( + "seqlen/curriculum_seqlen", args.curriculum_seqlen, iteration + ) + writer.add_scalar( + "seqlen/curriculum_seqlen vs samples", + args.curriculum_seqlen, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/curriculum_seqlen vs tokens", + args.curriculum_seqlen, + args.consumed_train_tokens, + ) + if args.random_ltd: + writer.add_scalar( + "seqlen/random_ltd_reserved_length", + args.random_ltd_reserved_length, + iteration, + ) + writer.add_scalar( + "seqlen/random_ltd_reserved_length vs samples", + args.random_ltd_reserved_length, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/random_ltd_reserved_length vs tokens", + args.random_ltd_reserved_length, + args.consumed_train_tokens, + ) + if args.log_memory_to_tensorboard: + mem_stats = torch.cuda.memory_stats() + writer.add_scalar( + "mem-reserved-bytes", + mem_stats["reserved_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-bytes", + mem_stats["allocated_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-count", + mem_stats["allocation.all.current"], + iteration, + ) + if iteration % args.tensorboard_log_interval == 0: + # This logging write various optimizer states to tensorboard. This + # feature may consume extra GPU memory thus is set at false by default. 
+ if args.log_optimizer_states_to_tensorboard and optimizer is not None: + opt_stats = [0.0] * 8 + opt_stats_2 = [0.0] * 4 + for _, group in enumerate(optimizer.param_groups): + for _, param in enumerate(group["params"]): + state_param = getattr(optimizer, "state", None) + if state_param is not None: + exp_avg_sq = state_param.get("exp_avg_sq", torch.tensor(0.0)) + exp_avg = state_param.get("exp_avg", torch.tensor(0.0)) + opt_stats[0] += (torch.norm(exp_avg_sq).item()) ** 2 + opt_stats[1] += (torch.norm(exp_avg_sq.sqrt()).item()) ** 2 + opt_stats[2] += (torch.norm(exp_avg).item()) ** 2 + opt_stats[3] += (torch.norm(param).item()) ** 2 + opt_stats[4] += torch.norm(exp_avg_sq, p=1).item() + opt_stats[5] += torch.norm(exp_avg_sq.sqrt(), p=1).item() + opt_stats[6] += torch.norm(exp_avg, p=1).item() + opt_stats[7] += torch.norm(param, p=1).item() + opt_stats_2[0] = max( + opt_stats_2[0], + abs(exp_avg_sq.max().item()), + abs(exp_avg_sq.min().item()), + ) + opt_stats_2[1] = max( + opt_stats_2[1], exp_avg_sq.sqrt().abs_().max().item() + ) + opt_stats_2[2] = max( + opt_stats_2[2], + abs(exp_avg.max().item()), + abs(exp_avg.min().item()), + ) + opt_stats_2[3] = max( + opt_stats_2[3], + abs(param.max().item()), + abs(param.min().item()), + ) + # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) + if args.zero_stage > 0: + # ZeRO partiions optimizer states + # opt_stats = opt_stats.clone().detach() + # opt_stats = get_accelerator().FloatTensor + opt_stats = accelerator.FloatTensor(opt_stats) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_sequence_data_parallel_group() + ) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + # opt_stats_2 = opt_stats_2.clone().detach() + opt_stats_2 = accelerator.FloatTensor(opt_stats_2) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_sequence_data_parallel_group(), + ) + + if 
args.tensor_model_parallel_size > 1: + # opt_stats = opt_stats.clone().detach() + opt_stats = accelerator.FloatTensor(opt_stats) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_tensor_model_parallel_group() + ) + # opt_stats_2 = opt_stats_2.clone().detach() + opt_stats_2 = accelerator.FloatTensor(opt_stats_2) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_tensor_model_parallel_group(), + ) + + if args.pipeline_model_parallel_size > 1: + # opt_stats = opt_stats.clone().detach() + opt_stats = accelerator.FloatTensor(opt_stats) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_pipeline_model_parallel_group() + ) + # opt_stats_2 = opt_stats_2.clone().detach() + opt_stats_2 = accelerator.FloatTensor(opt_stats_2) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_pipeline_model_parallel_group(), + ) + wandb_metrics |= { + "optimizer/learning_rate": learning_rate, + "optimizer/iteration": args.iteration, + "optimizer/consumed_train_tokens": args.consumed_train_tokens, + "optimizer/variance_l2": opt_stats[0] ** 0.5, + "optimizer/variance_sqrt_l2": opt_stats[1] ** 0.5, + "optimizer/momentum_l2": opt_stats[2] ** 0.5, + "optimizer/weight_l2": opt_stats[3] ** 0.5, + "optimizer/variance_l1": opt_stats[4], + "optimizer/variance_sqrt_l1": opt_stats[5], + "optimizer/momentum_l1": opt_stats[6], + "optimizer/weight_l1": opt_stats[7], + "optimizer/variance_abs_max": opt_stats_2[0], + "optimizer/variance_sqrt_abs_max": opt_stats_2[1], + "optimizer/momentum_abs_max": opt_stats_2[2], + "optimizer/weight_abs_max": opt_stats_2[3], + } + # print('step {} rank {} after sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) + if writer and is_last_rank(): + writer.add_scalar( + "optimizer/variance_l2 vs tokens", + opt_stats[0] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_l2 vs tokens", + 
opt_stats[1] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_l2 vs tokens", + opt_stats[2] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_l2 vs tokens", + opt_stats[3] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_l1 vs tokens", + opt_stats[4], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_l1 vs tokens", + opt_stats[5], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_l1 vs tokens", + opt_stats[6], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_l1 vs tokens", + opt_stats[7], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_abs_max vs tokens", + opt_stats_2[0], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_abs_max vs tokens", + opt_stats_2[1], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_abs_max vs tokens", + opt_stats_2[2], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_abs_max vs tokens", + opt_stats_2[3], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_l2", opt_stats[0] ** 0.5, iteration + ) + writer.add_scalar( + "optimizer/variance_sqrt_l2", opt_stats[1] ** 0.5, iteration + ) + writer.add_scalar( + "optimizer/momentum_l2", opt_stats[2] ** 0.5, iteration + ) + writer.add_scalar("optimizer/weight_l2", opt_stats[3] ** 0.5, iteration) + writer.add_scalar("optimizer/variance_l1", opt_stats[4], iteration) + writer.add_scalar("optimizer/variance_sqrt_l1", opt_stats[5], iteration) + writer.add_scalar("optimizer/momentum_l1", opt_stats[6], iteration) + writer.add_scalar("optimizer/weight_l1", opt_stats[7], iteration) + writer.add_scalar( + "optimizer/variance_abs_max", opt_stats_2[0], iteration + ) + writer.add_scalar( + "optimizer/variance_sqrt_abs_max", opt_stats_2[1], iteration + ) + writer.add_scalar( + 
"optimizer/momentum_abs_max", opt_stats_2[2], iteration + ) + writer.add_scalar("optimizer/weight_abs_max", opt_stats_2[3], iteration) + + assert args is not None + assert timers is not None + if iteration % args.log_interval == 0: + elapsed_time = timers("interval-time").elapsed(barrier=True) + elapsed_time_per_iteration = elapsed_time / total_iterations + seq_len = args.seq_length + if hasattr(args, "actual_seq_length"): + seq_len = args.actual_seq_length + samples_per_sec, tflops, approx_parameters_in_billions = throughput_calculator( + model, args, elapsed_time, total_iterations + ) + samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size + tokens_per_sec = samples_per_sec * seq_len + tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size + tokens_per_gpu_per_second = tokens_per_sec / args.world_size + tokens_per_gpu_per_second_per_replica = ( + tokens_per_gpu_per_second / args.data_parallel_size + ) + # NOTE: [2024-06-19] + # Updated to use (more accurate) calculation according to + # `num_floating_point_operations` from NVIDIA/Megatron-LM + num_flop_lm = num_floating_point_operations(args, batch_size) + num_flop_per_sec_lm = num_flop_lm / elapsed_time_per_iteration + tflops_lm = num_flop_per_sec_lm / (10**12) + tflops_lm_per_gpu = tflops_lm / args.world_size + wandb_metrics |= { + "throughput/iteration-time": elapsed_time_per_iteration, # 1000 ms / s + "throughput/samples_per_sec": samples_per_sec, + "throughput/samples_per_sec_per_replica": samples_per_sec_per_replica, + "throughput/tokens_per_sec": tokens_per_sec, + "throughput/tokens_per_sec_per_replica": tokens_per_sec_per_replica, + "throughput/tokens_per_gpu_per_sec": tokens_per_gpu_per_second, + "throughput/tokens_per_gpu_per_sec_per_replica": tokens_per_gpu_per_second_per_replica, + "throughput/tflops": tflops, + "throughput/tflops-new": num_flop_lm / elapsed_time_per_iteration, + "throughput/tflops-lm": tflops_lm_per_gpu, + "throughput/approx_params_in_billions": 
approx_parameters_in_billions, + "throughput/elapsed_ms_per_iteration": elapsed_time_per_iteration, + "throughput/iteration": iteration, + } + if loss_dict is not None: + wandb_metrics |= { + "loss/iteration": iteration, + **{f"loss/{k}": v for k, v in loss_dict.items()}, + } + if writer and args.log_timers_to_tensorboard: + writer.add_scalar( + "iteration-time/iteration-time", elapsed_time_per_iteration, iteration + ) + writer.add_scalar( + "iteration-time/iteration-time vs samples", + elapsed_time_per_iteration, + args.consumed_train_samples, + ) + writer.add_scalar( + "iteration-time/iteration-time vs tokens", + elapsed_time_per_iteration, + args.consumed_train_tokens, + ) + # metrics_to_log = { + # 'iteration': iteration, + # 'train_iters': args.train_iters, + # 'consumed_samples': args.consumed_train_samples, + # 'consumed_tokens': args.consumed_tokens, + # } + log_string = f" iteration={iteration:8d}/{args.train_iters:8d} |" + # .format( iteration, args.train_iters) + log_string += ( + f" consumed_samples={args.consumed_train_samples:12d} |" + # .format(args.consumed_train_samples) + ) + log_string += f" consumed_tokens={args.consumed_train_tokens:12d} |" + # .format( args.consumed_train_tokens) + log_string += ( + " elapsed_time_per_iteration_ms=" + f"{elapsed_time_per_iteration * 1000.0:.1f} |" + # .format( elapsed_time_per_iteration * 1000.0) + ) + log_string += f" learning_rate={learning_rate:.6g} |" + log_string += f" global_batch_size={batch_size:5d} |" + # if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= { + "training/iteration": iteration, + "training/iteration_time": elapsed_time_per_iteration, + "training/iteration_time_vs_tokens": ( + elapsed_time_per_iteration / args.consumed_train_tokens + ), + "training/iteration_time_vs_samples": ( + (elapsed_time_per_iteration / args.consumed_train_samples), + ), + "training/consumed_samples": args.consumed_train_samples, + "training/consumed_tokens": 
args.consumed_train_tokens, + } + for key in total_loss_dict: + if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: + avg = total_loss_dict[key].item() / float( + max(1, total_loss_dict[advanced_iters_key]) + ) + if avg > 0.0: + log_string += " {}={:.6f} |".format(key, avg) + total_loss_dict[key] = accelerator.FloatTensor([0.0]) + if loss_scale is not None: + log_string += " loss_scale={:.1f} |".format(loss_scale) + wandb_metrics |= {"loss/loss_scale": loss_scale} + if grad_norm is not None: + log_string += " grad_norm={:.3f} |".format(grad_norm) + wandb_metrics |= {"loss/grad_norm": grad_norm} + if num_zeros_in_grad is not None: + log_string += " num_zeros={:.1f} |".format(num_zeros_in_grad) + wandb_metrics |= {"loss/num_zeros_in_grad": num_zeros_in_grad} + if params_norm is not None: + log_string += " params_norm={:.3f} |".format(params_norm) + wandb_metrics |= {"loss/params_norm": params_norm} + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + log_string += " curriculum_seqlen={:5d} |".format(args.curriculum_seqlen) + if args.random_ltd: + log_string += " random_ltd reserved_length={:5d} |".format( + args.random_ltd_reserved_length + ) + # log_string += " | ".join([ + # f"{seq_len=:5d} ", + # f"{}" + # f"number_of_skipped_iterations={:3d}", + # + # ]) + log_string += " actual_seqlen={:5d} |".format(seq_len) + log_string += " number_of_skipped_iterations={:3d} |".format( + total_loss_dict[skipped_iters_key] + ) + log_string += " number_of_nan_iterations={:3d} |".format( + total_loss_dict[nan_iters_key] + ) + log_string += " samples_per_second={:.3f} |".format(samples_per_sec) + log_string += " tokens_per_gpu_per_second_tgs={:.3f} |".format( + tokens_per_gpu_per_second + ) + log_string += " [LM]TFLOPs={:.2f} |".format(tflops_lm_per_gpu) + log_string += " [DS]TFLOPs={:.2f} |".format(tflops) + if wandb is not None and getattr(wandb, "run", None) is not None: + wandb_metrics |= { + "training/skiped_iterations": 
total_loss_dict[skipped_iters_key] + } + wandb_metrics |= {"training/nan_iterations": total_loss_dict[nan_iters_key]} + wandb.log(wandb_metrics) + total_loss_dict[advanced_iters_key] = 0 + total_loss_dict[skipped_iters_key] = 0 + total_loss_dict[nan_iters_key] = 0 + # print_rank_last(log_string) + log.info(log_string) + if report_memory_flag and learning_rate > 0.0: + # Report memory after optimizer state has been initialized. + report_memory("(after {} iterations)".format(iteration)) + report_memory_flag = False + if timers is not None: + timers.log(timers_to_log, normalizer=args.log_interval) + + return report_memory_flag diff --git a/megatron/training_log_alcf.py b/megatron/training_log_alcf.py new file mode 100644 index 0000000000..dcd872971d --- /dev/null +++ b/megatron/training_log_alcf.py @@ -0,0 +1,725 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain utilities.""" + +from enum import Enum + +# from deepspeed.accelerator import get_accelerator +# from deepspeed.compression.compress import redundancy_clean +import torch +import os +import logging + +from megatron import get_args +from megatron import get_timers +from megatron import get_tensorboard_writer +from megatron import get_wandb_writer +from megatron import get_num_microbatches +from megatron.core import mpu + +# from megatron import is_rank_0, print_rank_0 +# from megatron import print_rank_last +# from megatron.arguments import core_transformer_config_from_args +# from megatron.checkpointing import load_checkpoint +# from megatron.checkpointing import save_checkpoint +# from megatron.core import mpu, tensor_parallel +# from megatron.core.enums import ModelType +# from megatron.core.pipeline_parallel import get_forward_backward_func +# from megatron.data.data_samplers import build_pretraining_data_loader +# from megatron.initialize import initialize_megatron +# from megatron.initialize import 
write_args_to_tensorboard +# from megatron.initialize import set_jit_fusion_options +# from megatron.model import Float16Module +# from megatron.model import GPTModel +# from megatron.model import DistributedDataParallel as LocalDDP +# from megatron.model.transformer import ParallelTransformerLayer +# from megatron.model.vision.knn_monitor import compute_feature_bank +# from megatron.optimizer import get_megatron_optimizer +# from megatron.optimizer_param_scheduler import OptimizerParamScheduler +# from megatron.profiler import on_step_begin, on_step_end, setup_profiler, trigger +# from megatron.utils import check_adlr_autoresume_termination +# from megatron.utils import found_kill_switch, unwrap_model +import ezpz as ez + +# from megatron.utils import calc_params_l2_norm +from megatron.utils import ( + # checkpoint_throughput_calculator, + report_memory, + throughput_calculator, + # update_rotary_pos_emb, +) + +try: + import wandb +except (ImportError, ModuleNotFoundError): + wandb = None +# The earliest we can measure the start time. +# _TRAIN_START_TIME = time.time() + + +log = logging.getLogger(__name__) + + +class InteropLoggingTool(Enum): + TENSORBOARD = 1 + WANDB = 2 + + +RANK: int = ez.get_rank() +LOCAL_RANK: int = ez.get_local_rank() +WORLD_SIZE: int = ez.get_world_size() +DEVICE_TYPE: str = ez.dist.get_torch_device_type() +DEVICE_ID: str = f"{DEVICE_TYPE}:{LOCAL_RANK}" +DEVICE: torch.device = torch.device(DEVICE_TYPE) + +log: logging.Logger = logging.getLogger(__name__) +LOG_LEVEL: str = str(os.environ.get("LOG_LEVEL", "INFO")).upper() +log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL") + + +def num_floating_point_operations(args, batch_size): + # Group Query Attention. + # if not args.group_query_attention: + if not args.num_key_value_heads: + args.num_key_value_heads = args.num_attention_heads + # args.num_query_groups = args.num_attention_heads + # MoE. 
+ # num_experts_routed_to = 1 if args.num_experts is None else args.moe_router_topk + num_experts_routed_to = 1 if args.num_experts is None else args.topk + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + return ( + 12 + * batch_size + * args.seq_length + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + 1 + + ( + (args.ffn_hidden_size / args.hidden_size) + * num_experts_routed_to + * gated_linear_multiplier + ) + + (args.num_key_value_heads / args.num_attention_heads) + + (args.seq_length / args.hidden_size) + + (args.padded_vocab_size / (2 * args.num_layers * args.hidden_size)) + ) + ) + + +def training_log( + loss_dict, + total_loss_dict, + learning_rate, + iteration, + loss_scale, + report_memory_flag, + skipped_iter, + grad_norm, + params_norm, + num_zeros_in_grad, + model=None, + optimizer=None, +): + """Log training information such as losses, timing, ....""" + args = get_args() + timers = get_timers() + writer = get_tensorboard_writer() + assert args is not None and timers is not None + wandb_metrics = {} + # Advanced, skipped, and Nan iterations. + advanced_iters_key = "advanced iterations" + skipped_iters_key = "skipped iterations" + nan_iters_key = "nan iterations" + # Advanced iterations. + if not skipped_iter: + total_loss_dict[advanced_iters_key] = ( + total_loss_dict.get(advanced_iters_key, 0) + 1 + ) + else: + if advanced_iters_key not in total_loss_dict: + total_loss_dict[advanced_iters_key] = 0 + # Skipped iterations. 
+ total_loss_dict[skipped_iters_key] = ( + total_loss_dict.get(skipped_iters_key, 0) + skipped_iter + ) + # Update losses and set nan iterations + got_nan = False + _zero = torch.tensor([0.0]).to(DEVICE) + for key in loss_dict: + if not skipped_iter: + total_loss_dict[key] = total_loss_dict.get(key, _zero) + loss_dict[key] + else: + value = loss_dict[key].float().sum().item() + is_nan = value == float("inf") or value == -float("inf") or value != value + got_nan = got_nan or is_nan + total_loss_dict[nan_iters_key] = total_loss_dict.get(nan_iters_key, 0) + int( + got_nan + ) + + # Logging. + timers_to_log = [ + "forward-backward", + "forward-compute", + "backward-compute", + "batch-generator", + "forward-recv", + "forward-send", + "backward-recv", + "backward-send", + "forward-send-forward-recv", + "forward-send-backward-recv", + "backward-send-forward-recv", + "backward-send-backward-recv", + "forward-backward-send-forward-backward-recv", + "layernorm-grads-all-reduce", + "embedding-grads-all-reduce", + "grads-all-reduce", + "grads-reduce-scatter", + "params-all-gather", + "optimizer-copy-to-main-grad", + "optimizer-unscale-and-check-inf", + "optimizer-clip-main-grad", + "optimizer-count-zeros", + "optimizer-inner-step", + "optimizer-copy-main-to-model-params", + "optimizer", + ] + + # Calculate batch size. + batch_size = ( + args.micro_batch_size * args.data_parallel_size * get_num_microbatches() + ) + total_iterations = ( + total_loss_dict[advanced_iters_key] + total_loss_dict[skipped_iters_key] + ) + + # Tensorboard values. + # Timer requires all the ranks to call. 
+ if args.log_timers_to_tensorboard and ( + iteration % args.tensorboard_log_interval == 0 and writer is not None + ): + timers.write(timers_to_log, writer, iteration, normalizer=total_iterations) + if writer and (iteration % args.tensorboard_log_interval == 0): + writer.add_scalar( + "steps-vs-samples/y=steps,x=samples", iteration, args.consumed_train_samples + ) + writer.add_scalar( + "steps-vs-samples/y=samples,x=steps", args.consumed_train_samples, iteration + ) + writer.add_scalar( + "steps-vs-tokens/y=steps,x=tokens", iteration, args.consumed_train_tokens + ) + writer.add_scalar( + "steps-vs-tokens/y=tokens,x=steps", args.consumed_train_tokens, iteration + ) + if args.log_learning_rate_to_tensorboard: + wandb_metrics |= { + "learning-rate/iteration": iteration, + "learning-rate/learning-rate": learning_rate, + } + writer.add_scalar("learning-rate/learning-rate", learning_rate, iteration) + writer.add_scalar( + "learning-rate/learning-rate vs samples", + learning_rate, + args.consumed_train_samples, + ) + writer.add_scalar( + "learning-rate/learning-rate vs tokens", + learning_rate, + args.consumed_train_tokens, + ) + if args.log_batch_size_to_tensorboard: + writer.add_scalar("batch-size/batch-size", batch_size, iteration) + writer.add_scalar( + "batch-size/batch-size vs samples", + batch_size, + args.consumed_train_samples, + ) + writer.add_scalar( + "batch-size/batch-size vs tokens", + batch_size, + args.consumed_train_tokens, + ) + wandb_metrics |= { + "lm-loss-training/iteration": iteration, + "lm-loss-training/consumed_train_tokens": args.consumed_train_tokens, + } + for key in loss_dict: + wandb_metrics |= {f"lm-loss-training/{key}": loss_dict[key]} + writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration) + writer.add_scalar( + f"lm-loss-training/{key}" + " vs samples", + loss_dict[key], + args.consumed_train_samples, + ) + writer.add_scalar( + f"lm-loss-training/{key}" + " vs tokens", + loss_dict[key], + args.consumed_train_tokens, + ) 
+ if args.fp16 and loss_scale and args.log_loss_scale_to_tensorboard: + writer.add_scalar("loss-scale/loss-scale", loss_scale, iteration) + writer.add_scalar( + "loss-scale/loss-scale vs samples", + loss_scale, + args.consumed_train_samples, + ) + writer.add_scalar( + "loss-scale/loss-scale vs tokens", + loss_scale, + args.consumed_train_tokens, + ) + if args.log_world_size_to_tensorboard: + writer.add_scalar("world-size/world-size", args.world_size, iteration) + writer.add_scalar( + "world-size/world-size vs samples", + args.world_size, + args.consumed_train_samples, + ) + writer.add_scalar( + "world-size/world-size vs tokens", + args.world_size, + args.consumed_train_tokens, + ) + if grad_norm is not None: + wandb_metrics |= {"training/grad-norm": grad_norm} + writer.add_scalar("grad-norm/grad-norm", grad_norm, iteration) + writer.add_scalar( + "grad-norm/grad-norm vs samples", grad_norm, args.consumed_train_samples + ) + writer.add_scalar( + "grad-norm/grad-norm vs tokens", grad_norm, args.consumed_train_tokens + ) + if num_zeros_in_grad is not None: + wandb_metrics |= {"training/num-zeros": num_zeros_in_grad} + writer.add_scalar("num-zeros/num-zeros", num_zeros_in_grad, iteration) + writer.add_scalar( + "num-zeros/num-zeros vs samples", + num_zeros_in_grad, + args.consumed_train_samples, + ) + writer.add_scalar( + "num-zeros/num-zeros vs tokens", + num_zeros_in_grad, + args.consumed_train_tokens, + ) + if params_norm is not None: + wandb_metrics |= {"training/params-norm": params_norm} + writer.add_scalar("params-norm/params-norm", params_norm, iteration) + writer.add_scalar( + "params-norm/params-norm vs samples", + params_norm, + args.consumed_train_samples, + ) + writer.add_scalar( + "params-norm/params-norm vs tokens", + params_norm, + args.consumed_train_tokens, + ) + if hasattr(args, "actual_seq_length"): + writer.add_scalar( + "seqlen/actual_seq_length", args.actual_seq_length, iteration + ) + writer.add_scalar( + "seqlen/actual_seq_length vs samples", + 
args.actual_seq_length, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/actual_seq_length vs tokens", + args.actual_seq_length, + args.consumed_train_tokens, + ) + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + writer.add_scalar( + "seqlen/curriculum_seqlen", args.curriculum_seqlen, iteration + ) + writer.add_scalar( + "seqlen/curriculum_seqlen vs samples", + args.curriculum_seqlen, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/curriculum_seqlen vs tokens", + args.curriculum_seqlen, + args.consumed_train_tokens, + ) + if args.random_ltd: + writer.add_scalar( + "seqlen/random_ltd_reserved_length", + args.random_ltd_reserved_length, + iteration, + ) + writer.add_scalar( + "seqlen/random_ltd_reserved_length vs samples", + args.random_ltd_reserved_length, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/random_ltd_reserved_length vs tokens", + args.random_ltd_reserved_length, + args.consumed_train_tokens, + ) + if args.log_memory_to_tensorboard: + mem_stats = torch.cuda.memory_stats() + writer.add_scalar( + "mem-reserved-bytes", + mem_stats["reserved_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-bytes", + mem_stats["allocated_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-count", + mem_stats["allocation.all.current"], + iteration, + ) + if iteration % args.tensorboard_log_interval == 0: + # This logging write various optimizer states to tensorboard. This + # feature may consume extra GPU memory thus is set at false by default. 
+ if args.log_optimizer_states_to_tensorboard and optimizer is not None: + opt_stats = [0.0] * 8 + opt_stats_2 = [0.0] * 4 + for _, group in enumerate(optimizer.param_groups): + for _, param in enumerate(group["params"]): + state_param = getattr(optimizer, "state", None) + if state_param is not None: + exp_avg_sq = state_param.get("exp_avg_sq", torch.tensor(0.0)) + exp_avg = state_param.get("exp_avg", torch.tensor(0.0)) + opt_stats[0] += (torch.norm(exp_avg_sq).item()) ** 2 + opt_stats[1] += (torch.norm(exp_avg_sq.sqrt()).item()) ** 2 + opt_stats[2] += (torch.norm(exp_avg).item()) ** 2 + opt_stats[3] += (torch.norm(param).item()) ** 2 + opt_stats[4] += torch.norm(exp_avg_sq, p=1).item() + opt_stats[5] += torch.norm(exp_avg_sq.sqrt(), p=1).item() + opt_stats[6] += torch.norm(exp_avg, p=1).item() + opt_stats[7] += torch.norm(param, p=1).item() + opt_stats_2[0] = max( + opt_stats_2[0], + abs(exp_avg_sq.max().item()), + abs(exp_avg_sq.min().item()), + ) + opt_stats_2[1] = max( + opt_stats_2[1], exp_avg_sq.sqrt().abs_().max().item() + ) + opt_stats_2[2] = max( + opt_stats_2[2], + abs(exp_avg.max().item()), + abs(exp_avg.min().item()), + ) + opt_stats_2[3] = max( + opt_stats_2[3], + abs(param.max().item()), + abs(param.min().item()), + ) + if args.zero_stage > 0: + # ZeRO partiions optimizer states + # opt_stats = get_accelerator().FloatTensor(opt_stats) + opt_stats = torch.tensor(opt_stats).to(DEVICE) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_sequence_data_parallel_group() + ) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + opt_stats_2 = torch.tensor(opt_stats_2).to(DEVICE) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_sequence_data_parallel_group(), + ) + + if args.tensor_model_parallel_size > 1: + opt_stats = torch.tensor(opt_stats).to(DEVICE) + # opt_stats = get_accelerator().FloatTensor(opt_stats) + torch.distributed.all_reduce( + opt_stats, 
group=mpu.get_tensor_model_parallel_group() + ) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + opt_stats_2 = torch.tensor(opt_stats_2).to(DEVICE) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_tensor_model_parallel_group(), + ) + + if args.pipeline_model_parallel_size > 1: + # opt_stats = get_accelerator().FloatTensor(opt_stats) + opt_stats = torch.tensor(opt_stats).to(DEVICE) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_pipeline_model_parallel_group() + ) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + opt_stats_2 = torch.tensor(opt_stats_2).to(DEVICE) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_pipeline_model_parallel_group(), + ) + + wandb_metrics |= { + "optimizer/learning_rate": learning_rate, + "optimizer/iteration": args.iteration, + "optimizer/consumed_train_tokens": args.consumed_train_tokens, + "optimizer/variance_l2": opt_stats[0] ** 0.5, + "optimizer/variance_sqrt_l2": opt_stats[1] ** 0.5, + "optimizer/momentum_l2": opt_stats[2] ** 0.5, + "optimizer/weight_l2": opt_stats[3] ** 0.5, + "optimizer/variance_l1": opt_stats[4], + "optimizer/variance_sqrt_l1": opt_stats[5], + "optimizer/momentum_l1": opt_stats[6], + "optimizer/weight_l1": opt_stats[7], + "optimizer/variance_abs_max": opt_stats_2[0], + "optimizer/variance_sqrt_abs_max": opt_stats_2[1], + "optimizer/momentum_abs_max": opt_stats_2[2], + "optimizer/weight_abs_max": opt_stats_2[3], + } + # print('step {} rank {} after sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) + # if writer and is_last_rank(): + if writer is not None and RANK == 0: + writer.add_scalar( + "optimizer/variance_l2 vs tokens", + opt_stats[0] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_l2 vs tokens", + opt_stats[1] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + 
"optimizer/momentum_l2 vs tokens", + opt_stats[2] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_l2 vs tokens", + opt_stats[3] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_l1 vs tokens", + opt_stats[4], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_l1 vs tokens", + opt_stats[5], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_l1 vs tokens", + opt_stats[6], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_l1 vs tokens", + opt_stats[7], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_abs_max vs tokens", + opt_stats_2[0], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_abs_max vs tokens", + opt_stats_2[1], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_abs_max vs tokens", + opt_stats_2[2], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_abs_max vs tokens", + opt_stats_2[3], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_l2", opt_stats[0] ** 0.5, iteration + ) + writer.add_scalar( + "optimizer/variance_sqrt_l2", opt_stats[1] ** 0.5, iteration + ) + writer.add_scalar( + "optimizer/momentum_l2", opt_stats[2] ** 0.5, iteration + ) + writer.add_scalar("optimizer/weight_l2", opt_stats[3] ** 0.5, iteration) + writer.add_scalar("optimizer/variance_l1", opt_stats[4], iteration) + writer.add_scalar("optimizer/variance_sqrt_l1", opt_stats[5], iteration) + writer.add_scalar("optimizer/momentum_l1", opt_stats[6], iteration) + writer.add_scalar("optimizer/weight_l1", opt_stats[7], iteration) + writer.add_scalar( + "optimizer/variance_abs_max", opt_stats_2[0], iteration + ) + writer.add_scalar( + "optimizer/variance_sqrt_abs_max", opt_stats_2[1], iteration + ) + writer.add_scalar( + "optimizer/momentum_abs_max", opt_stats_2[2], iteration + ) + 
writer.add_scalar("optimizer/weight_abs_max", opt_stats_2[3], iteration) + + assert args is not None + assert timers is not None + if iteration % args.log_interval == 0: + elapsed_time = timers("interval-time").elapsed(barrier=True) + elapsed_time_per_iteration = elapsed_time / total_iterations + seq_len = args.seq_length + if hasattr(args, "actual_seq_length"): + seq_len = args.actual_seq_length + samples_per_sec, tflops, approx_parameters_in_billions = throughput_calculator( + model, args, elapsed_time, total_iterations + ) + samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size + tokens_per_sec = samples_per_sec * seq_len + tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size + tokens_per_gpu_per_second = tokens_per_sec / args.world_size + tokens_per_gpu_per_second_per_replica = ( + tokens_per_gpu_per_second / args.data_parallel_size + ) + # NOTE: [2024-06-19] + # Updated to use (more accurate) calculation according to + # `num_floating_point_operations` from NVIDIA/Megatron-LM + num_flop_lm = num_floating_point_operations(args, batch_size) + num_flop_per_sec_lm = num_flop_lm / elapsed_time_per_iteration + tflops_lm = num_flop_per_sec_lm / (10**12) + tflops_lm_per_gpu = tflops_lm / args.world_size + wandb_metrics |= { + "throughput/iteration-time": elapsed_time_per_iteration, # 1000 ms / s + "throughput/samples_per_sec": samples_per_sec, + "throughput/samples_per_sec_per_replica": samples_per_sec_per_replica, + "throughput/tokens_per_sec": tokens_per_sec, + "throughput/tokens_per_sec_per_replica": tokens_per_sec_per_replica, + "throughput/tokens_per_gpu_per_sec": tokens_per_gpu_per_second, + "throughput/tokens_per_gpu_per_sec_per_replica": tokens_per_gpu_per_second_per_replica, + "throughput/tflops": tflops, + "throughput/tflops-new": num_flop_lm / elapsed_time_per_iteration, + "throughput/tflops-lm": tflops_lm_per_gpu, + "throughput/approx_params_in_billions": approx_parameters_in_billions, + 
"throughput/elapsed_ms_per_iteration": elapsed_time_per_iteration, + "throughput/iteration": iteration, + } + if loss_dict is not None: + wandb_metrics |= { + "loss/iteration": iteration, + **{f"loss/{k}": v for k, v in loss_dict.items()}, + } + if writer and args.log_timers_to_tensorboard: + writer.add_scalar( + "iteration-time/iteration-time", elapsed_time_per_iteration, iteration + ) + writer.add_scalar( + "iteration-time/iteration-time vs samples", + elapsed_time_per_iteration, + args.consumed_train_samples, + ) + writer.add_scalar( + "iteration-time/iteration-time vs tokens", + elapsed_time_per_iteration, + args.consumed_train_tokens, + ) + # metrics_to_log = { + # 'iteration': iteration, + # 'train_iters': args.train_iters, + # 'consumed_samples': args.consumed_train_samples, + # 'consumed_tokens': args.consumed_tokens, + # } + log_string = f" iteration={iteration:8d}/{args.train_iters:8d} |" + # .format( iteration, args.train_iters) + log_string += ( + f" consumed_samples={args.consumed_train_samples:12d} |" + # .format(args.consumed_train_samples) + ) + log_string += f" consumed_tokens={args.consumed_train_tokens:12d} |" + # .format( args.consumed_train_tokens) + log_string += ( + " elapsed_time_per_iteration_ms=" + f"{elapsed_time_per_iteration * 1000.0:.1f} |" + # .format( elapsed_time_per_iteration * 1000.0) + ) + log_string += f" learning_rate={learning_rate:.6g} |" + log_string += f" global_batch_size={batch_size:5d} |" + # if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= { + "training/iteration": iteration, + "training/iteration_time": elapsed_time_per_iteration, + "training/iteration_time_vs_tokens": ( + elapsed_time_per_iteration / args.consumed_train_tokens + ), + "training/iteration_time_vs_samples": ( + (elapsed_time_per_iteration / args.consumed_train_samples), + ), + "training/consumed_samples": args.consumed_train_samples, + "training/consumed_tokens": args.consumed_train_tokens, + } + for key in 
total_loss_dict: + if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: + avg = total_loss_dict[key].item() / float( + max(1, total_loss_dict[advanced_iters_key]) + ) + if avg > 0.0: + log_string += " {}={:.6f} |".format(key, avg) + total_loss_dict[key] = torch.tensor([0.0]).to(DEVICE) + if loss_scale is not None: + log_string += " loss_scale={:.1f} |".format(loss_scale) + wandb_metrics |= {"loss/loss_scale": loss_scale} + if grad_norm is not None: + log_string += " grad_norm={:.3f} |".format(grad_norm) + wandb_metrics |= {"loss/grad_norm": grad_norm} + if num_zeros_in_grad is not None: + log_string += " num_zeros={:.1f} |".format(num_zeros_in_grad) + wandb_metrics |= {"loss/num_zeros_in_grad": num_zeros_in_grad} + if params_norm is not None: + log_string += " params_norm={:.3f} |".format(params_norm) + wandb_metrics |= {"loss/params_norm": params_norm} + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + log_string += " curriculum_seqlen={:5d} |".format(args.curriculum_seqlen) + if args.random_ltd: + log_string += " random_ltd reserved_length={:5d} |".format( + args.random_ltd_reserved_length + ) + # log_string += " | ".join([ + # f"{seq_len=:5d} ", + # f"{}" + # f"number_of_skipped_iterations={:3d}", + # + # ]) + log_string += " actual_seqlen={:5d} |".format(seq_len) + log_string += " number_of_skipped_iterations={:3d} |".format( + total_loss_dict[skipped_iters_key] + ) + log_string += " number_of_nan_iterations={:3d} |".format( + total_loss_dict[nan_iters_key] + ) + log_string += " samples_per_second={:.3f} |".format(samples_per_sec) + log_string += " tokens_per_gpu_per_second_tgs={:.3f} |".format( + tokens_per_gpu_per_second + ) + log_string += " [LM]TFLOPs={:.2f} |".format(tflops_lm_per_gpu) + log_string += " [DS]TFLOPs={:.2f} |".format(tflops) + total_loss_dict[advanced_iters_key] = 0 + total_loss_dict[skipped_iters_key] = 0 + total_loss_dict[nan_iters_key] = 0 + # print_rank_last(log_string) + 
log.info(log_string) + if report_memory_flag and learning_rate > 0.0: + # Report memory after optimizer state has been initialized. + report_memory("(after {} iterations)".format(iteration)) + report_memory_flag = False + if wandb is not None and getattr(wandb, "run", None) is not None: + wandb_metrics |= { + "training/skiped_iterations": total_loss_dict[skipped_iters_key] + } + wandb_metrics |= {"training/nan_iterations": total_loss_dict[nan_iters_key]} + wandb.log(wandb_metrics) + if timers is not None: + timers.log(timers_to_log, normalizer=args.log_interval) + + return report_memory_flag diff --git a/megatron/utils.py b/megatron/utils.py index 97294070af..dc1dea0b3a 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -4,30 +4,155 @@ import sys import os +import logging +from typing import Optional import torch from torch.nn.parallel import DistributedDataParallel as torchDDP from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': - from apex.multi_tensor_apply import multi_tensor_applier - import amp_C - -from megatron import ( - get_args, - get_adlr_autoresume, - get_num_microbatches -) + +from megatron import get_args, get_adlr_autoresume, get_num_microbatches from megatron.core import mpu from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate from megatron.model.module import param_is_not_shared from megatron.model.rotary_pos_embedding import RotaryEmbedding +import ezpz as ez + +ACCELERATOR = get_accelerator() +assert ACCELERATOR is not None + +if ACCELERATOR.device_name() == "cuda": + try: + from apex.multi_tensor_apply import multi_tensor_applier # type:ignore + import amp_C # type:ignore + + HAS_APEX = True + except Exception: + HAS_APEX = False + +RANK = ez.get_rank() +log = logging.getLogger(__name__) +log.setLevel(os.environ.get("LOG_LEVEL", ("INFO" if RANK == 0 else "CRITICAL"))) + +_DLIO_PROFILER_EXIST = True +_DFTRACER_EXIST = True + +try: + import dftracer # type:ignore 
+except Exception: + _DFTRACER_EXIST = False + +try: + import dlio_profiler # type:ignore +except Exception: + _DLIO_PROFILER_EXIST = False + + +if _DFTRACER_EXIST: + from dftracer.logger import ( # type:ignore + dftracer as PerfTrace, + dft_fn as Profile, + DFTRACER_ENABLE as DFTRACER_ENABLE, + ) +elif _DLIO_PROFILER_EXIST: + from dlio_profiler.logger import fn_interceptor as Profile # type:ignore + from dlio_profiler.logger import dlio_logger as PerfTrace # type:ignore +else: + from functools import wraps + + class Profile(object): + def __init__( + self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None + ): + return + + def log(self, func): + return func + + def log_init(self, func): + return func + + def iter(self, func, iter_name="step"): + return func + + def __enter__(self): + return + + def __exit__(self, type, value, traceback): + return + + def update( + self, epoch=None, step=None, image_idx=None, image_size=None, args={} + ): + return + + def flush(self): + return + + def reset(self): + return + + def log_static(self, func): + return + + class dftracer(object): + def __init__( + self, + ): + self.type = None + + def initialize_log(self, logfile=None, data_dir=None, process_id=-1): + return + + def get_time(self): + return + + def enter_event(self): + return + + def exit_event(self): + return + + def log_event(self, name, cat, start_time, duration, string_args=None): + return + + def finalize(self): + return + + PerfTrace = dftracer() + DFTRACER_ENABLE = False + + +def get_logger( + name: str, + level: Optional[str] = None, + rank_zero_only: Optional[bool] = True, +) -> logging.Logger: + """Returns a `logging.Logger` object. + + If `rank_zero_only` passed, the level will be set to CRITICAL on all + non-zero ranks (and will be set to `level` on RANK==0). 
+ """ + logger = logging.getLogger(name) + logger.setLevel( + str(level if level is not None else os.environ.get("LOG_LEVEL", "INFO")).upper() + ) + if rank_zero_only and ez.get_rank() != 0: + logger.setLevel("CRITICAL") + return logger + def update_rotary_pos_emb(seq_length): args = get_args() - rotary_dim = args.hidden_size // args.num_attention_heads \ - if args.kv_channels is None else args.kv_channels + accelerator = get_accelerator() + assert args is not None and accelerator is not None + rotary_dim = ( + args.hidden_size // args.num_attention_heads + if args.kv_channels is None + else args.kv_channels + ) if args.rotary_percent < 1.0: rotary_dim = int(rotary_dim * args.rotary_percent) @@ -36,7 +161,8 @@ def update_rotary_pos_emb(seq_length): # Wang and Komatsuzaki et al # https://github.com/kingoflolz/mesh-transformer-jax/ rotary_pos_emb = RotaryEmbedding(rotary_dim, theta=args.rope_theta)(seq_length).to( - get_accelerator().current_device_name()) + accelerator.current_device_name() + ) args.rotary_pos_emb = rotary_pos_emb @@ -56,8 +182,9 @@ def unwrap_model(model, module_instances=(torchDDP)): def calc_params_l2_norm(model): - """Calculate l2 norm of parameters """ + """Calculate l2 norm of parameters""" args = get_args() + assert args is not None if not isinstance(model, list): model = [model] # Remove duplicate params. @@ -73,82 +200,84 @@ def calc_params_l2_norm(model): params_data.append(param.data) # Calculate norm dummy_overflow_buf = get_accelerator().IntTensor([0]) - - if get_accelerator().device_name() == 'cuda': + if get_accelerator().device_name() == "cuda" and HAS_APEX: norm, _ = multi_tensor_applier( amp_C.multi_tensor_l2norm, dummy_overflow_buf, [params_data], - False # no per-parameter norm + False, # no per-parameter norm ) - else : - norm = torch.norm(params_data,p=2.0) + else: + norm = torch.norm(params_data, p=2.0) norm_2 = norm * norm # Sum across all model-parallel GPUs. 
- torch.distributed.all_reduce(norm_2, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + torch.distributed.all_reduce( + norm_2, op=torch.distributed.ReduceOp.SUM, group=mpu.get_model_parallel_group() + ) return norm_2.item() ** 0.5 def average_losses_across_data_parallel_group(losses): """Reduce a tensor of losses across all GPUs.""" - averaged_losses = torch.cat( - [loss.clone().detach().view(1) for loss in losses]) - torch.distributed.all_reduce(averaged_losses, - group=mpu.get_data_parallel_group()) - averaged_losses = averaged_losses / \ - torch.distributed.get_world_size(group=mpu.get_data_parallel_group()) + averaged_losses = torch.cat([loss.clone().detach().view(1) for loss in losses]) + torch.distributed.all_reduce(averaged_losses, group=mpu.get_data_parallel_group()) + averaged_losses = averaged_losses / torch.distributed.get_world_size( + group=mpu.get_data_parallel_group() + ) return averaged_losses def report_memory(name): """Simple GPU memory report.""" + accelerator = get_accelerator() + assert accelerator is not None mega_bytes = 1024.0 * 1024.0 - string = name + ' memory (MB)' - string += ' | allocated: {}'.format( - get_accelerator().memory_allocated() / mega_bytes) - string += ' | max allocated: {}'.format( - get_accelerator().max_memory_allocated() / mega_bytes) - string += ' | reserved: {}'.format( - get_accelerator().memory_reserved() / mega_bytes) - string += ' | max reserved: {}'.format( - get_accelerator().max_memory_reserved() / mega_bytes) + string = name + " memory (MB)" + string += " | allocated: {}".format(accelerator.memory_allocated() / mega_bytes) + string += " | max allocated: {}".format( + accelerator.max_memory_allocated() / mega_bytes + ) + reserved = accelerator.memory_reserved() + max_reserved = accelerator.max_memory_reserved() + if reserved is not None: + string += " | reserved: {}".format(reserved / mega_bytes) + if max_reserved is not None: + string += " | max reserved: {}".format(max_reserved 
/ mega_bytes) if mpu.get_data_parallel_rank() == 0: - print("[Rank {}] {}".format(torch.distributed.get_rank(), string), - flush=True) + log.info(f"[Rank {RANK}] {string}") def print_params_min_max_norm(optimizer, iteration): """Print min, max, and norm of all parameters.""" index = 0 rank = torch.distributed.get_rank() - string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n' + string = "iteration, rank, index, tensor-model-parallel, min, max, norm\n" optimizer_ = optimizer.optimizer for param_group in optimizer_.param_groups: - for param in param_group['params']: + for param in param_group["params"]: index += 1 min_ = param.data.min() max_ = param.data.max() norm = torch.linalg.norm(param.data) - string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( - iteration, rank, index, int(param.tensor_model_parallel)) - string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) - print(string, flush=True) + string += "{:7d}, {:4d}, {:4d}, {:2d}, ".format( + iteration, rank, index, int(param.tensor_model_parallel) + ) + string += "{:.6E}, {:.6E}, {:.6E}\n".format(min_, max_, norm) + log.info(string) -def check_adlr_autoresume_termination(iteration, model, - optimizer, opt_param_scheduler): +def check_adlr_autoresume_termination(iteration, model, optimizer, opt_param_scheduler): """Check for autoresume signal and exit if it is received.""" from megatron.checkpointing import save_checkpoint args = get_args() + assert args is not None autoresume = get_adlr_autoresume() # Add barrier to ensure consistnecy. 
torch.distributed.barrier() - if autoresume.termination_requested(): + if autoresume is not None and autoresume.termination_requested(): if args.save: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) print_rank_0(">>> autoresume termination request found!") @@ -158,12 +287,14 @@ def check_adlr_autoresume_termination(iteration, model, sys.exit(0) -def get_ltor_masks_and_position_ids(data, - eod_token, - reset_position_ids, - reset_attention_mask, - eod_mask_loss, - skip_mask=False): +def get_ltor_masks_and_position_ids( + data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss, + skip_mask=False, +): """Build masks and position id for left to right model.""" # Extract batch size and sequence length. @@ -177,8 +308,9 @@ def get_ltor_masks_and_position_ids(data, attention_mask = None if not skip_mask: - attention_mask = torch.tril(torch.ones( - (att_mask_batch, seq_length, seq_length))).view(att_mask_batch, 1, seq_length, seq_length) + attention_mask = torch.tril( + torch.ones((att_mask_batch, seq_length, seq_length), device=data.device) + ).view(att_mask_batch, 1, seq_length, seq_length) # Loss mask. loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) @@ -186,8 +318,7 @@ def get_ltor_masks_and_position_ids(data, loss_mask[data == eod_token] = 0.0 # Position ids. - position_ids = torch.arange(seq_length, dtype=torch.long, - device=data.device) + position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device) position_ids = position_ids.unsqueeze(0).expand_as(data) # We need to clone as the ids will be modifed based on batch index. if reset_position_ids: @@ -196,7 +327,6 @@ def get_ltor_masks_and_position_ids(data, if reset_position_ids or reset_attention_mask: # Loop through the batches: for b in range(micro_batch_size): - # Find indecies where EOD token is. eod_index = position_ids[b, data[b] == eod_token] # Detach indecies from positions if going to modify positions. 
@@ -208,125 +338,234 @@ def get_ltor_masks_and_position_ids(data, for j in range(eod_index.size()[0]): i = eod_index[j] # Mask attention loss. - if reset_attention_mask and not skip_mask: - attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + if ( + reset_attention_mask + and not skip_mask + and attention_mask is not None + ): + attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0 # Reset positions. if reset_position_ids: - position_ids[b, (i + 1):] -= (i + 1 - prev_index) + position_ids[b, (i + 1) :] -= i + 1 - prev_index prev_index = i + 1 # Convert attention mask to binary: if not skip_mask: - attention_mask = (attention_mask < 0.5) - attention_mask = attention_mask.to(data.device) + assert attention_mask is not None + attention_mask = attention_mask < 0.5 return attention_mask, loss_mask, position_ids def print_rank_0(message): """If distributed is initialized, print only on rank 0.""" - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - print(message, flush=True) - else: - print(message, flush=True) + # if torch.distributed.is_initialized(): + # if torch.distributed.get_rank() == 0: + # # print(message, flush=True) + # print(message, flush=True) + # else: + # print(message, flush=True) + _ = log.info(f"{message}") if RANK == 0 else None + def is_last_rank(): - return torch.distributed.get_rank() == ( - torch.distributed.get_world_size() - 1) + return torch.distributed.get_rank() == (torch.distributed.get_world_size() - 1) + def print_rank_last(message): """If distributed is initialized, print only on last rank.""" if torch.distributed.is_initialized(): if is_last_rank(): - print(message, flush=True) + # print(message, flush=True) + log.info(message) else: - print(message, flush=True) + log.info(message) + def is_aml(): # Are we running inside an Azure Machine Learning (AML) environment? - return 'AZUREML_EXPERIMENT_ID' in os.environ + return "AZUREML_EXPERIMENT_ID" in os.environ + def is_rank_0(): """Check whether it is rank 0. 
For AML, check if it is rank 0 of a node""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0 or ( - is_aml() and torch.distributed.get_rank() % get_accelerator().device_count() == 0 - ): + is_aml() + and (torch.distributed.get_rank() % get_accelerator().device_count()) == 0 + ): return True else: return False else: return True -def get_parameters_in_billions(model): - gpus_per_model = torch.distributed.get_world_size(group=mpu.get_model_parallel_group()) - approx_parameters_in_billions = sum([sum([p.ds_numel if hasattr(p,'ds_id') else p.nelement() for p in model_module.parameters()]) - for model_module in model]) +def get_parameters_in_billions(model): + gpus_per_model = torch.distributed.get_world_size( + group=mpu.get_model_parallel_group() + ) + + approx_parameters_in_billions = sum( + [ + sum( + [ + p.ds_numel if hasattr(p, "ds_id") else p.nelement() + for p in model_module.parameters() + ] + ) + for model_module in model + ] + ) + + return approx_parameters_in_billions * gpus_per_model / (1e9) + + +def num_floating_point_operations(args, batch_size): + # Group Query Attention. + # if not args.group_query_attention: + if not args.num_key_value_heads: + args.num_key_value_heads = args.num_attention_heads + # args.num_query_groups = args.num_attention_heads + # MoE. 
+ # num_experts_routed_to = 1 if args.num_experts is None else args.moe_router_topk + num_experts_routed_to = 1 if args.num_experts is None else args.topk + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + return ( + 12 + * batch_size + * args.seq_length + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + 1 + + ( + (args.ffn_hidden_size / args.hidden_size) + * num_experts_routed_to + * gated_linear_multiplier + ) + + (args.num_key_value_heads / args.num_attention_heads) + + (args.seq_length / args.hidden_size) + + (args.padded_vocab_size / (2 * args.num_layers * args.hidden_size)) + ) + ) - return approx_parameters_in_billions*gpus_per_model/(1e9) def throughput_calculator(model, args, iteration_time, total_iterations): - batch_size = args.micro_batch_size * get_num_microbatches() * args.data_parallel_size - approx_parameters_in_billions = None if (model is None) else get_parameters_in_billions(model) - elapsed_time_per_iter = iteration_time/total_iterations + batch_size = ( + args.micro_batch_size * get_num_microbatches() * args.data_parallel_size + ) + approx_parameters_in_billions = ( + None if (model is None) else get_parameters_in_billions(model) + ) + elapsed_time_per_iter = iteration_time / total_iterations samples_per_second = batch_size / elapsed_time_per_iter - #flops calculator + # flops calculator hidden_size = args.hidden_size + num_attention_heads = args.num_attention_heads + head_dim = hidden_size // num_attention_heads + ffn_hidden_size = args.ffn_hidden_size num_layers = args.num_layers vocab_size = args.padded_vocab_size + gqa = args.num_attention_heads // args.num_key_value_heads + num_experts_routed_to = args.topk + ffn_multiplier = 3 if args.swiglu else 2 + macs_per_flops = 2 # General TFLOPs formula (borrowed from Equation 3 in Section 5.1 of # https://arxiv.org/pdf/2104.04473.pdf). - # The factor of 4 is when used with activation check-pointing, - # otherwise it will be 3. 
- checkpoint_activations_factor = 3 - if hasattr(args, 'checkpoint_activations') and args.checkpoint_activations: - checkpoint_activations_factor = 4 - if hasattr(args, 'recompute_granularity') and (args.recompute_granularity == 'selective' or args.recompute_granularity == 'full'): - checkpoint_activations_factor = 4 + # correction has been made to TFLOPs formula due to incorrect behavior + # observed with selective recompute when GQA not used and for all with GQA seq_len = args.seq_length if hasattr(args, 'actual_seq_length'): seq_len = args.actual_seq_length - flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. * num_layers * hidden_size))) + pre_and_post_mha_gemm_macs = ( + batch_size * num_layers * (1 + (2 // gqa) + 1) * (hidden_size**2) * seq_len + ) + mha_bgemm_macs = ( + batch_size * num_layers * 2 * head_dim * num_attention_heads * (seq_len**2) + ) + ffn_gemm_macs = ( + batch_size + * num_layers + * ffn_multiplier + * ffn_hidden_size + * hidden_size + * seq_len + * num_experts_routed_to + ) + logit_lmhead_gemm_macs = batch_size * vocab_size * hidden_size * seq_len + + fwd_macs = ( + pre_and_post_mha_gemm_macs + + mha_bgemm_macs + + ffn_gemm_macs + + logit_lmhead_gemm_macs + ) + bwd_macs = 2 * fwd_macs + fwd_bwd_macs = fwd_macs + bwd_macs + + if (hasattr(args, "checkpoint_activations") and args.checkpoint_activations) or ( + hasattr(args, "recompute_granularity") and args.recompute_granularity == "full" + ): + fwd_bwd_macs += fwd_macs + if ( + hasattr(args, "recompute_granularity") + and args.recompute_granularity == "selective" + ): + fwd_bwd_macs += mha_bgemm_macs + + flops_per_iteration = fwd_bwd_macs * macs_per_flops tflops = flops_per_iteration / (elapsed_time_per_iter * args.world_size * (10**12)) return samples_per_second, tflops, approx_parameters_in_billions + def checkpoint_throughput_calculator(model, latency_second): 
approx_parameters_in_billions = get_parameters_in_billions(model) - checkpoint_multiplier = 14 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4) + checkpoint_multiplier = ( + 14 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4) + ) checkpoint_GB = approx_parameters_in_billions * checkpoint_multiplier GB_per_second = checkpoint_GB / latency_second - print_rank_0(f"Checkpoint Save GB: {round(checkpoint_GB, 3)}, GB/Sec: {round(GB_per_second,2)}, Latency(second): {round(latency_second, 3)}") + print_rank_0( + f"Checkpoint Save GB: {round(checkpoint_GB, 3)}, GB/Sec: {round(GB_per_second,2)}, Latency(second): {round(latency_second, 3)}" + ) def get_fingerprint_header(): return f"{'min':^13} {'max':^13} {'mean':^13} {'l2 norm':^12} metadata" + def get_fingerprint(p): return f"{p.min():13.6e} {p.max():13.6e} {p.mean():13.6e} {p.norm():12.6e}" def dump_position_embed_weights(preamble, iteration, model): - # return + # return from deepspeed.utils import safe_get_full_fp32_param + tp_rank = mpu.get_tensor_model_parallel_rank() pp_rank = mpu.get_pipeline_model_parallel_rank() dp_rank = mpu.get_data_parallel_rank() get_fingerprint_header() for n, p in model[0].named_parameters(): - if 'position_embeddings' in n: + if "position_embeddings" in n: tag = "pos_embed" elif "word_embeddings" in n: tag = "word_embed" else: - continue - print(f"iter {iteration} {preamble} {tag} lp {tp_rank}/{pp_rank}/{dp_rank}: {get_fingerprint(p)} {p.shape}\n") + continue + log.info( + f"iter {iteration} {preamble} {tag} lp {tp_rank}/{pp_rank}/{dp_rank}: {get_fingerprint(p)} {p.shape}\n" + ) fp32_value = safe_get_full_fp32_param(p) - if fp32_value is not None: - print(f"iter {iteration} {preamble} {tag} hp {tp_rank}/{pp_rank}/{dp_rank}: {get_fingerprint(fp32_value)} {p.shape}\n") + if fp32_value is not None: + log.info( + f"iter {iteration} {preamble} {tag} hp {tp_rank}/{pp_rank}/{dp_rank}: {get_fingerprint(fp32_value)} {p.shape}\n" + ) + def 
dump_weights(preamble, iteration, model, optimizer, tensor=None): # return @@ -337,19 +576,19 @@ def dump_weights(preamble, iteration, model, optimizer, tensor=None): fn = f"debug-bf16-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" # only care for first and last pp stages and dp0 tp0 - #if not (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()): + # if not (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()): # return - #if not (tp_rank == 0 and dp_rank == 0): + # if not (tp_rank == 0 and dp_rank == 0): # return if tensor is not None: orig_tensor = tensor if hasattr(tensor, "_hp_param"): - numel = tensor._hp_param.numel() # // dp_size + numel = tensor._hp_param.numel() # // dp_size tensor = tensor.flatten().narrow(0, 0, numel) - #print(fn) + # print(fn) with open(fn, "w") as fh: fh.write(f"{get_fingerprint_header()}\n") @@ -359,25 +598,30 @@ def dump_weights(preamble, iteration, model, optimizer, tensor=None): for n, p in model[0].named_parameters(): fh.write(f"{get_fingerprint(p)} {n} {p.shape}\n") - + # # until we figure out how to dump the actual fp32 values don't do this + # fn = f"debug-fp32-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" + # with open(fn, "w") as fh: + # fh.write(f"{get_fingerprint_header()}\n") + # if tensor is not None: + # tensor = orig_tensor + # if hasattr(tensor, "_hp_param"): + # fh.write(f"{get_fingerprint(tensor._hp_param)} tensor {tensor._hp_param.shape}\n") + # #fh.write(f"{get_fingerprint(tensor._hp_grad)} tensor grad\n") + # else: + # fh.write(f"{get_fingerprint(tensor)} tensor {tensor.shape}\n") + # #fh.write(f"{get_fingerprint(tensor.grad)} tensor grad\n") + # + # else: + # if hasattr(model[0].module.tied_modules, "embed"): + # p = model[0].module.tied_modules.embed.word_embeddings.weight._hp_param + # fh.write(f"{get_fingerprint(p)} module.tied_modules.embed.word_embeddings.weight._hp_param {p.shape}\n") return - # until we figure out how to dump the actual fp32 values 
don't do this - fn = f"debug-fp32-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" - with open(fn, "w") as fh: - fh.write(f"{get_fingerprint_header()}\n") - if tensor is not None: - tensor = orig_tensor - if hasattr(tensor, "_hp_param"): - fh.write(f"{get_fingerprint(tensor._hp_param)} tensor {tensor._hp_param.shape}\n") - #fh.write(f"{get_fingerprint(tensor._hp_grad)} tensor grad\n") - else: - fh.write(f"{get_fingerprint(tensor)} tensor {tensor.shape}\n") - #fh.write(f"{get_fingerprint(tensor.grad)} tensor grad\n") - - else: - if hasattr(model[0].module.tied_modules, "embed"): - p = model[0].module.tied_modules.embed.word_embeddings.weight._hp_param - fh.write(f"{get_fingerprint(p)} module.tied_modules.embed.word_embeddings.weight._hp_param {p.shape}\n") - +def found_kill_switch(): + args = get_args() + assert args is not None + if args.kill_switch_file is not None and os.path.exists(args.kill_switch_file): + return True + else: + return False diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 785a129156..52681e5f8f 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -36,8 +36,8 @@ def model_provider(pre_process=True, post_process=True): args = get_args() config = core_transformer_config_from_args(args) - if hasattr(mpu, 'get_sequence_parallel_group'): - dpg = mpu.get_sequence_parallel_group() + if hasattr(mpu, 'get_sequence_data_parallel_group'): + dpg = mpu.get_sequence_data_parallel_group() elif hasattr(mpu, 'get_data_parallel_group'): dpg = mpu.get_data_parallel_group() else: diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 4fefef795f..3686c6ceeb 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -2,13 +2,22 @@ """Pretrain GPT""" +import time +from typing import Callable +from mpi4py import MPI + +comm = MPI.COMM_WORLD +comm.Barrier() +python_start_time = time.time() + import os from rich import print import torch import math from functools import partial from megatron import get_args -from megatron import 
print_rank_0 + +# from megatron import print_rank_0 from megatron import get_timers from megatron import get_tokenizer from megatron.core import mpu, tensor_parallel @@ -17,148 +26,157 @@ from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids -from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb -from megatron.arguments import core_transformer_config_from_args from megatron.utils import ( - report_memory, - throughput_calculator, - checkpoint_throughput_calculator + average_losses_across_data_parallel_group, + update_rotary_pos_emb, ) -from pathlib import Path +from megatron.arguments import core_transformer_config_from_args + +# from megatron.utils import Profile, PerfTrace + +import logging import deepspeed from deepspeed.runtime.utils import see_memory_usage -from deepspeed.accelerator.real_accelerator import get_accelerator + +# from deepspeed.accelerator.real_accelerator import get_accelerator import subprocess import wandb -import time from torch import nn import torch.nn.functional as F - -# from ezpz import get_logger -from ezpz.dist import get_world_size, setup_wandb, get_rank - -# RANK = setup_torch( -# backend='deepspeed', -# port='5432', -# ) -RANK = get_rank() -WORLD_SIZE = get_world_size() -LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" - -WANDB_MODE = os.environ.get('WANDB_MODE', None) -DISABLE_WANDB = ( - WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' -) - +import ezpz as ez + +dt_imports = time.time() - python_start_time +t0_setup = time.time() + +# ---- [SETUP COMMS] ------------------------ +# if str(os.environ.get('LAUNCH_CMD', 'mpich')).lower() == 'mpich': +RANK = ez.setup_torch(backend="deepspeed") # , timeout=7200) +dt_setup = time.time() - t0_setup +# else: +# RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +LOCAL_RANK = ez.get_local_rank() +DEVICE_TYPE = ez.dist.get_torch_device_type() 
+if torch.cuda.is_available(): + torch.cuda.set_device(LOCAL_RANK) + +log = logging.getLogger(__name__) +LOG_LEVEL = str(os.environ.get("LOG_LEVEL", "INFO")).upper() +# set logging level to "INFO" on RANK == 0, "CRITICAL" on all other ranks +log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL") + +log.info(f"Import python modules in {dt_imports} seconds") +log.info(f"ez.setup_torch time: {dt_setup} seconds") + +# ---- [SETUP WANDB FROM RANK 0] -------------- +WANDB_MODE = os.environ.get("WANDB_MODE", None) +DISABLE_WANDB = WANDB_MODE is not None and str(WANDB_MODE).lower() == "disabled" if RANK == 0 and not DISABLE_WANDB: - project_name = ( - os.environ.get( - 'WB_PROJECT', - os.environ.get( - 'WANDB_PROJECT', - 'AuroraGPT' - ), - ) + project_name = os.environ.get( + "WB_PROJECT", # look for WB_PROJECT in env + os.environ.get("WANDB_PROJECT", "AuroraGPT"), # look for WANDB_PROJECT in env ) - print('--------------------------------------------------') - print(f"Setting up W&B from: {RANK} with {project_name}") - print('--------------------------------------------------') - setup_wandb(project_name=project_name) + log.info(f"Setting up W&B from: {RANK} with {project_name}") + _ = ez.setup_wandb(project_name=project_name) +@ez.dist.timeitlogit(rank=RANK) def model_provider(pre_process=True, post_process=True): """Build the model.""" - print_rank_0('building GPT model ...') + log.info("building GPT model ...") see_memory_usage("Before Building Model", force=True) args = get_args() + assert args is not None config = core_transformer_config_from_args(args) - if wandb.run is not None: - print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") - wandb.run.config.update({"args": vars(args)}) - if RANK == 0: - git_ds_info() - if hasattr(mpu, 'get_sequence_parallel_group'): - dpg = mpu.get_sequence_parallel_group() - elif hasattr(mpu, 'get_data_parallel_group'): + # if RANK == 0: + # git_ds_info() + if hasattr(mpu, "get_sequence_data_parallel_group"): + 
dpg = mpu.get_sequence_data_parallel_group() + elif hasattr(mpu, "get_data_parallel_group"): dpg = mpu.get_data_parallel_group() else: dpg = None - if wandb is not None and wandb.run is not None: - assert wandb is not None and wandb.run is not None - print(f'Updating {wandb.run.name=} at {wandb.run.url=}') - wandb.run.config.update({'args': vars(args)}) - with deepspeed.zero.Init( - data_parallel_group=dpg, - remote_device=( - None if args.remote_device == 'none' else args.remote_device - ), - config_dict_or_path=args.deepspeed_config_dict, - enabled=args.zero_stage == 3, - mpu=mpu + deepspeed_zero_init = deepspeed.zero.Init + if args.use_mics: + deepspeed_zero_init = deepspeed.zero.MiCS_Init + with deepspeed_zero_init( + data_parallel_group=dpg, + remote_device=(None if args.remote_device == "none" else args.remote_device), + config_dict_or_path=args.deepspeed_config, # _dict, + enabled=args.zero_stage == 3, + mpu=mpu, ): if args.deepspeed and not args.no_pipeline_parallel: - model = GPTModelPipe( - config=config, - num_tokentypes=0, - parallel_output=True - ) + model = GPTModelPipe(config=config, num_tokentypes=0, parallel_output=True) # This is a hack to give us a reference to # get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize model._megatron_batch_fn = get_batch_pipe - - # Predompute the attention mask and store it in args. + # Precompute the attention mask and store it in args. # This avoids having to pipeline it # as an activation during training. # The mask is constant, and thus we can reuse it. 
attention_mask = torch.tril( torch.ones( (1, args.seq_length, args.seq_length), - device=get_accelerator().current_device_name() + device=DEVICE_TYPE, ) ).view(1, 1, args.seq_length, args.seq_length) - # Convert attention mask to binary: - attention_mask = (attention_mask < 0.5) + attention_mask = attention_mask < 0.5 if args.fp16: attention_mask = attention_mask.half() elif args.bf16: attention_mask = attention_mask.bfloat16() - # Attention mask must be bool. args.attn_mask = attention_mask.to(torch.bool) - - # For prertaining, since sequence length is fixed, + # For pretraining, since sequence length is fixed, # cache rotary embedding in args, to avoid communicating around if args.use_rotary_position_embeddings: update_rotary_pos_emb(args.seq_length) - else: model = GPTModel( config=config, num_tokentypes=0, parallel_output=True, pre_process=pre_process, - post_process=post_process + post_process=post_process, ) num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - # print_rank_0('\n ------------------------ ') - # print_rank_0(f'num of parameters {num_params}') - # print_rank_0('------------------------\n ') - print_rank_0(80 * '-') - print_rank_0(f"Number of parameters in model: {num_params}") - print_rank_0(80 * '-') + log.info(80 * "-") + log.info(f"Number of parameters in model: {num_params}") + log.info(80 * "-") see_memory_usage("After Building Model", force=True) - if wandb.run is not None: - wandb.run.config.update({'num_params': num_params}) - # wandb.run.watch( - # model, - # log='all', - # log_graph=True, - # ) - # wandb.run.config.update({'num_params': num_params}) + if wandb is not None and getattr(wandb, "run", None) is not None: + assert wandb.run is not None + tbdir = args.tensorboard_dir + # tbdir = args.getattr('tensorboard_dir', None) + if tbdir is not None: + try: + log.info(f"Patching tensorboard from {tbdir}") + wandb.tensorboard.patch(root_logdir=tbdir) + except ValueError as exc: + log.exception(exc) + 
log.warning("Continuing without patching tensorboard!") + wandb.run.config.update({"num_params": num_params}) + if "args" not in wandb.run.config: + log.info( + f"Updating WandB run.config: [{wandb.run.name}]({wandb.run.get_url()})" + ) + try: + wandb.run.config.update({"args": dict(sorted(vars(args).items()))}) + except Exception: + log.error('Unable to `wandb.run.config.update({"args": vars(args)})`') + # try: + # wandb.run.watch( + # model, + # log='all', + # log_graph=True, + # ) + # except Exception: + # pass return model @@ -166,24 +184,29 @@ def get_batch(data_iterator): """Generate a batch""" args = get_args() tokenizer = get_tokenizer() - + assert args is not None and tokenizer is not None # Items and their type. - keys = ['text'] + keys = ["text"] datatype = torch.int64 - data = next(data_iterator) if data_iterator is not None else None + + if ( + args.iteration < 10 + and RANK == 0 + and os.environ.get("DUMP_TOKENS", None) + and data is not None + ): + log.info(f"{args.iteration=}: {data['text'][:10]=}") # # Broadcast data. # if data_iterator is not None: # data = next(data_iterator) # else: # data = None data_b = tensor_parallel.broadcast_data(keys, data, datatype) - # Unpack. - tokens_ = data_b['text'].long() + tokens_ = data_b["text"].long() labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() - # Get the masks and postition ids. 
skip_mask = args.use_flash_attn or args.use_flash_attn_triton attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -192,53 +215,63 @@ def get_batch(data_iterator): args.reset_position_ids, args.reset_attention_mask, args.eod_mask_loss, - skip_mask) - + skip_mask, + ) # For DS's sequence parallel seq_parallel_world_size = mpu.get_sequence_parallel_world_size() seq_parallel_world_rank = mpu.get_sequence_parallel_rank() - # For Megatron's sequence parallel if args.sequence_parallel: seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() seq_length = tokens.size(1) - assert seq_length % seq_parallel_world_size == 0 sub_seq_length = seq_length // seq_parallel_world_size sub_seq_start = seq_parallel_world_rank * sub_seq_length sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length - tokens = tokens[:, sub_seq_start:sub_seq_end] position_ids = position_ids[:, sub_seq_start:sub_seq_end] # For DS's sequence parallel if mpu.get_sequence_parallel_world_size() > 1: labels = labels[:, sub_seq_start:sub_seq_end] - return tokens, labels, loss_mask, attention_mask, position_ids def data_post_process(data, data_sampler_state_dict): args = get_args() + assert args is not None if args.data_efficiency_curriculum_learning: - if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']: - args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_truncate' - current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate'] + if "seqlen_truncate" in data_sampler_state_dict["current_difficulties"]: + args.data_efficiency_curriculum_learning_seqlen_type = "seqlen_truncate" + current_seqlen = data_sampler_state_dict["current_difficulties"][ + "seqlen_truncate" + ] if current_seqlen < args.seq_length: - data['text'] = data['text'][:, :(current_seqlen+1)].contiguous() - elif 'seqlen_reshape' in data_sampler_state_dict['current_difficulties']: - 
args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_reshape' - current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_reshape'] + data["text"] = data["text"][:, : (current_seqlen + 1)].contiguous() + elif "seqlen_reshape" in data_sampler_state_dict["current_difficulties"]: + args.data_efficiency_curriculum_learning_seqlen_type = "seqlen_reshape" + current_seqlen = data_sampler_state_dict["current_difficulties"][ + "seqlen_reshape" + ] if current_seqlen < args.seq_length: - orig_num_token = torch.numel(data['text']) - reshape_len = (data['text'].size()[1] // (current_seqlen+1)) * (current_seqlen+1) - data['text'] = torch.cat((data['text'][:, :reshape_len].contiguous().view(-1, current_seqlen+1), - data['text'][:, -(current_seqlen+1):]), 0).contiguous() - num_row = math.ceil(orig_num_token / (current_seqlen+1)) - num_row = min(num_row, data['text'].size()[0]) + orig_num_token = torch.numel(data["text"]) + reshape_len = (data["text"].size()[1] // (current_seqlen + 1)) * ( + current_seqlen + 1 + ) + data["text"] = torch.cat( + ( + data["text"][:, :reshape_len] + .contiguous() + .view(-1, current_seqlen + 1), + data["text"][:, -(current_seqlen + 1) :], + ), + 0, + ).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen + 1)) + num_row = min(num_row, data["text"].size()[0]) if num_row > 1 and num_row % 2 != 0: num_row -= 1 - data['text'] = data['text'][:num_row, :].contiguous() + data["text"] = data["text"][:num_row, :].contiguous() else: args.data_efficiency_curriculum_learning_seqlen_type = None return data @@ -251,48 +284,42 @@ def get_batch_pipe(data): """ args = get_args() tokenizer = get_tokenizer() - + assert args is not None # Items and their type. - keys = ['text'] + keys = ["text"] datatype = torch.int64 - # Broadcast data. data_b = tensor_parallel.broadcast_data(keys, data, datatype) - # Unpack. 
- tokens_ = data_b['text'].long() + tokens_ = data_b["text"].long() labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() - # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, args.reset_attention_mask, - args.eod_mask_loss) - if ( - args.curriculum_learning_legacy - and args.curriculum_seqlen < tokens.size()[1] - ): + args.eod_mask_loss, + ) + if args.curriculum_learning_legacy and args.curriculum_seqlen < tokens.size()[1]: # seqlen-based curriculum learning # tokens, position_ids, labels, loss_mask # have size [batch size, seqlen] - tokens = tokens[:, :args.curriculum_seqlen].contiguous() - position_ids = position_ids[:, :args.curriculum_seqlen].contiguous() + tokens = tokens[:, : args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, : args.curriculum_seqlen].contiguous() if labels is not None: - labels = labels[:, :args.curriculum_seqlen].contiguous() - loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() - + labels = labels[:, : args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() return (tokens, position_ids, attention_mask), (labels, loss_mask) def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): args = get_args() + assert args is not None losses = output_tensor.float() loss_mask = loss_mask.view(-1).float() loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - # Reduce loss for logging. 
averaged_loss = average_losses_across_data_parallel_group([loss]) if args.mos or args.kd: @@ -300,139 +327,108 @@ def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): loss = loss + moe_loss + mos_loss if args.mos: return loss, { - 'total loss': loss, - 'lm loss': averaged_loss[0], - 'moe loss': moe_loss, - 'mos loss': mos_loss + "total loss": loss, + "lm loss": averaged_loss[0], + "moe loss": moe_loss, + "mos loss": mos_loss, } elif args.kd: return loss, { - 'total loss': loss, - 'lm loss': averaged_loss[0], - 'moe loss': moe_loss, - 'kd loss': mos_loss + "total loss": loss, + "lm loss": averaged_loss[0], + "moe loss": moe_loss, + "kd loss": mos_loss, } - print_rank_0( - f'>>> total loss: {loss}, ' - f'lm loss {averaged_loss[0]}, ' - f'kd loss {mos_loss}' + log.info( + f">>> total loss: {loss}, " + f"lm loss {averaged_loss[0]}, " + f"kd loss {mos_loss}" ) else: if max(args.num_experts) <= 1: - return loss, {'lm loss': averaged_loss[0]} + return loss, {"lm loss": averaged_loss[0]} loss = loss + moe_loss - return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss} + return loss, {"lm loss": averaged_loss[0], "moe loss": moe_loss} def calculate_mos_loss( - args, - stu_output, - teacher_model, - tokens, - position_ids, - attention_mask + args, stu_output, teacher_model, tokens, position_ids, attention_mask ): mos_loss = 0 alpha = args.kd_alpha_ce beta = args.kd_beta_ce kd_temp = args.kd_temp - if teacher_model: with torch.no_grad(): if ( - args.curriculum_learning_legacy and - args.curriculum_seqlen < args.seq_length + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length ): assert args.curriculum_seqlen is not None curriculum_seqlen = args.curriculum_seqlen tokens = tokens[:, :curriculum_seqlen].contiguous() position_ids = position_ids[:, :curriculum_seqlen].contiguous() csl = curriculum_seqlen - attention_mask = ( - attention_mask[:, :, :csl, :csl].contiguous() - ) + attention_mask = attention_mask[:, :, :csl, :csl].contiguous() 
# No need to truncate labels # as we do not need it for the teacher logits tea_output, tea_other_losses = teacher_model( - tokens, - position_ids, - attention_mask + tokens, position_ids, attention_mask ) assert stu_output.size() == tea_output.size(), ( - 'teacher and student output should match in size. ' - f'Student: {stu_output.size()}, ' - f'Teacher: {tea_output.size()}, ' - f'CL seq length {args.curriculum_seqlen}' + "teacher and student output should match in size. " + f"Student: {stu_output.size()}, " + f"Teacher: {tea_output.size()}, " + f"CL seq length {args.curriculum_seqlen}" ) - student_logits = F.log_softmax(stu_output / kd_temp, dim=2) # The target logits is expected to be probabilities. # If we use log_softmax, # then we need to set target_log to true # when initializing the KLDivLoss. tea_logits = F.softmax(tea_output / kd_temp, dim=2) - - mos_loss = kd_temp * kd_temp * nn.KLDivLoss(reduction='batchmean')( - student_logits, - tea_logits + mos_loss = ( + kd_temp + * kd_temp + * nn.KLDivLoss(reduction="batchmean")(student_logits, tea_logits) ) - mos_loss = mos_loss.div(args.seq_length) * beta return mos_loss -def forward_step(data_iterator, model): +def forward_step(data_iterator, model) -> tuple[torch.Tensor | None, Callable]: """Forward step.""" args = get_args() timers = get_timers() - + assert args is not None + assert timers is not None # Get the batch. 
- timers('batch-generator', log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) - timers('batch-generator').stop() - + timers("batch-generator", log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator) + timers("batch-generator").stop() if args.data_efficiency_curriculum_learning: args.curriculum_seqlen = tokens.size()[1] - if ( - hasattr( - args, - 'data_efficiency_curriculum_learning_seqlen_type') - and ( - args.data_efficiency_curriculum_learning_seqlen_type - == 'seqlen_reshape' - ) + if hasattr(args, "data_efficiency_curriculum_learning_seqlen_type") and ( + args.data_efficiency_curriculum_learning_seqlen_type == "seqlen_reshape" ): - args.data_efficiency_curriculum_learning_numel = ( - torch.numel(tokens) - ) - + args.data_efficiency_curriculum_learning_numel = torch.numel(tokens) + stu_output = None if args.mos or args.kd: # The forward func can return either the loss or the logits, # depending on whether passing in the labels or not. 
stu_output, other_losses = model(tokens, position_ids, attention_mask) - if ( - args.curriculum_learning_legacy - and args.curriculum_seqlen < args.seq_length - ): + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: assert args.curriculum_seqlen is not None - labels = labels[:, :args.curriculum_seqlen].contiguous() + labels = labels[:, : args.curriculum_seqlen].contiguous() output_tensor = tensor_parallel.vocab_parallel_cross_entropy( - stu_output.contiguous().float(), - labels + stu_output.contiguous().float(), labels ) else: output_tensor, other_losses = model( - tokens, - position_ids, - attention_mask, - labels=labels + tokens, position_ids, attention_mask, labels=labels ) - if ( - args.curriculum_learning_legacy and - args.curriculum_seqlen < args.seq_length - ): - loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() moe_losses = [] for moe_loss in other_losses: @@ -450,7 +446,7 @@ def forward_step(data_iterator, model): args.teacher_model[0], tokens, position_ids, - attention_mask + attention_mask, ) # Output_tensor stores the standard loss, @@ -458,28 +454,51 @@ def forward_step(data_iterator, model): return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) +@ez.dist.timeitlogit(rank=RANK) def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" + t0 = time.perf_counter() args = get_args() - - print_rank_0('> building train, validation, and test datasets ' - 'for GPT ...') + assert args is not None + # from ezpz.profile import get_context_manager + # cm = get_context_manager(rank=RANK, outdir=args.save) + # with cm: + log.info("> building train, validation, and test datasets for GPT ...") files = [] if args.data_file_list is not None: - with open(args.data_file_list, 'r') as flist: + log.info(f"Reading 
datasets from {args.data_file_list}") + # [!NOTE]: + # - We expect each line of args.data_file_list to be of the form: + # `weight /path/tp/data_text_document corpus` + # where: + # - `weight` is the relative weight of that document + # across all documents (i.e. lines in `args.data_file_list`) + # - `/path/to/data_text_document` is the path to the text document + # - `corpus` is the corpus (~ source, can be made up) where that + # document came from (i.e. `books`, `arxiv`, etc.) + with open(args.data_file_list, "r") as flist: for f in flist.readlines(): - w, fname = f.split() - files.append(float(w)) - files.append(fname) + if len(f.strip()) != 0: + try: + w, fname, c = f.split() + except Exception as exc: + log.exception(exc) + raise Exception( + "Please provide the file list as " + "'weight, filename, corpus'" + ) + if fname.find(".bin") != -1: + fname = fname.split(".bin")[0] + files.extend([float(w), fname, c]) # weight # filename # corpus elif len(args.data_path) == 1 and os.path.isdir(args.data_path[0]): path = args.data_path[0] + "/" for f in os.listdir(path): - if (os.path.isfile(path + f) and f.find(".bin") != -1): + if os.path.isfile(path + f) and f.find(".bin") != -1: files.append(1) files.append(path + f.split(".bin")[0]) else: files = args.data_path - print_rank_0(f"file list {files}") + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=files, data_impl=args.data_impl, @@ -487,23 +506,19 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): train_valid_test_num_samples=train_val_test_num_samples, seq_length=args.seq_length, seed=args.seed, - skip_warmup=True, - # skip_warmup=(not args.mmap_warmup), + skip_warmup=(not args.mmap_warmup), train_data_prefix=args.train_data_path, valid_data_prefix=args.valid_data_path, test_data_prefix=args.test_data_path, - data_cache_path=args.data_cache_path) - print_rank_0("> finished creating GPT datasets ...") - + data_cache_path=args.data_cache_path, + ) + dt = 
time.perf_counter_ns() - t0 + log.info(f"> finished creating GPT datasets. Took: {dt:.5f}s") return train_ds, valid_ds, test_ds def command_exists(cmd): - result = subprocess.Popen( - f'type {cmd}', - stdout=subprocess.PIPE, - shell=True - ) + result = subprocess.Popen(f"type {cmd}", stdout=subprocess.PIPE, shell=True) return result.wait() == 0 @@ -511,17 +526,18 @@ def git_ds_info(): if RANK != 0: return from deepspeed.env_report import main as ds_report + ds_report() # Write out version/git info git_hash_cmd = "git rev-parse --short HEAD" git_branch_cmd = "git rev-parse --abbrev-ref HEAD" - if command_exists('git'): + if command_exists("git"): try: result = subprocess.check_output(git_hash_cmd, shell=True) - git_hash = result.decode('utf-8').strip() + git_hash = result.decode("utf-8").strip() result = subprocess.check_output(git_branch_cmd, shell=True) - git_branch = result.decode('utf-8').strip() + git_branch = result.decode("utf-8").strip() except subprocess.CalledProcessError: git_hash = "unknown" git_branch = "unknown" @@ -529,36 +545,69 @@ def git_ds_info(): git_hash = "unknown" git_branch = "unknown" print( - f'**** Git info for Megatron: ' - f'git_hash={git_hash} git_branch={git_branch} ****' + f"**** Git info for Megatron: " + f"git_hash={git_hash} git_branch={git_branch} ****" ) def main(): - # if RANK == 0: - # setup_wandb() - if os.getenv('TORCH_PROFILER_ENABLED') == '1': - from torch.profiler import profile, record_function, ProfilerActivity - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + if os.getenv("TORCH_PROFILER_ENABLE") == "1": + # record_function + from torch.profiler import profile, ProfilerActivity + + try: + activities = [ + ProfilerActivity.CPU, + ProfilerActivity.CUDA, + ProfilerActivity.XPU, + ] + except Exception as exc: + log.exception(exc) + log.warning("TORCH PROFILER WARNING: XPU is not supported") + activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA] + with profile(activities=activities) as 
prof: model = pretrain( train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process, ) - - prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") + args = get_args() + assert args is not None + prof.export_chrome_trace( + f"{args.trace_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json" + ) else: model = pretrain( train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process, ) + # try: + # from megatron.text_generation import generate_and_post_process + # with torch.autocast(device_type=DEVICE, dtype=args.dtype): + # response, _, _, _ = generate_and_post_process( + # model, + # prompts=[ + # "Hello world", + # "Nature is", + # "Turing test comprises", + # "Explain solar eclipse" + # ], + # tokens_to_generate=32 + # ) + # if RANK == 0: + # log.info(f'generation completed..\n response:{response}') + # except ValueError as ve: + # log.critical(f'ValueError: {ve}') + # pass + # dist.barrier() + # model.train() return model @@ -572,10 +621,11 @@ def main(): # data_post_process=data_post_process) import sys import deepspeed.comm as dist + model = main() dist.log_summary() if wandb.run is not None: print(f"wandb.run.name: {wandb.run.name}") print(f"wandb.run.url: {wandb.run.url}") wandb.finish() - sys.exit() + sys.exit(0) diff --git a/test_agptllama.py b/test_agptllama.py new file mode 100644 index 0000000000..e1d207fa27 --- /dev/null +++ b/test_agptllama.py @@ -0,0 +1,34 @@ +import torch +import intel_extension_for_pytorch as ipex +from transformers import GPT2Tokenizer, AutoTokenizer, 
AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer, LlamaForCausalLM +def batch_encode(prompts, tokenizer, prompt_len=512): + input_tokens = tokenizer.batch_encode_plus(prompts, return_tensors="pt", padding="max_length", max_length=len(prompts)) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to("xpu") + #input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) + return input_tokens + + +def generate_prompt(model, tokenizer, prompts): + + input_tokens = batch_encode(prompts, tokenizer) + print(input_tokens) + generate_kwargs = dict(max_new_tokens=30, do_sample=False) + output_ids = model.generate(**input_tokens, **generate_kwargs) + print(output_ids) + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + return outputs + +if __name__ == '__main__': + + model = LlamaForCausalLM.from_pretrained("/flare/Aurora_deployment/vsastry/hf_new_cp/") + model.to("xpu") # model.cuda() + model.seqlen = 4096 + + # get llama tokenizer + tokenizer = LlamaTokenizer.from_pretrained("/flare/Aurora_deployment/AuroraGPT/datasets/dolma/utils/tokenizer.model") + tokenizer.pad_token = tokenizer.eos_token + output = generate_prompt(model, tokenizer, prompts=["What is the language spoken in Mexico ?"]) + print(output) diff --git a/tests/models/test_gpt_embedding.py b/tests/models/test_gpt_embedding.py index 700990adc2..199f29dede 100644 --- a/tests/models/test_gpt_embedding.py +++ b/tests/models/test_gpt_embedding.py @@ -1,15 +1,22 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
import pytest import torch +import types from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.models.gpt.gpt_embedding import GPTEmbedding +from megatron.global_vars import set_args +from deepspeed.accelerator import get_accelerator +device_name = get_accelerator().device_name() @pytest.fixture def gpt_embedding(transformer_config): + args = types.SimpleNamespace(params_dtype=torch.float32, embed_layernorm=False) + set_args(args) embedding = GPTEmbedding(config=transformer_config, vocab_size=100, max_sequence_length=4) return embedding @@ -36,12 +43,12 @@ def test_cpu_forward(self, gpt_embedding: GPTEmbedding): assert embeddings.shape[1] == input_ids.shape[0] assert embeddings.shape[2] == gpt_embedding.config.hidden_size - def test_gpu_forward(self, gpt_embedding: GPTEmbedding): - gpt_embedding.cuda() - input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() - position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() + def test_accelerator_forward(self, gpt_embedding: GPTEmbedding): + gpt_embedding.to(device_name) + input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).to(device_name) + position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).to(device_name) embeddings = gpt_embedding(input_ids, position_ids) - assert embeddings.device.type == 'cuda' + assert embeddings.device.type == device_name assert embeddings.shape[0] == gpt_embedding.max_sequence_length assert embeddings.shape[1] == input_ids.shape[0] assert embeddings.shape[2] == gpt_embedding.config.hidden_size diff --git a/tests/models/test_gpt_model.py b/tests/models/test_gpt_model.py index b854ecd918..cf322908b3 100644 --- a/tests/models/test_gpt_model.py +++ b/tests/models/test_gpt_model.py @@ -1,20 +1,28 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
import pytest import torch +import types from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.global_vars import set_args +from deepspeed.accelerator import get_accelerator +device_name = get_accelerator().device_name() @pytest.fixture def gpt_model(transformer_config): + args = types.SimpleNamespace(params_dtype=torch.float32, embed_layernorm=False) + set_args(args) language_model = GPTModel(config=transformer_config, vocab_size=100, max_sequence_length=4) return language_model class TestGPTModel: + @pytest.mark.xfail(device_name=='hpu', reason="TELayerNorm is not defined in HPU") def test_constructor(self, gpt_model: GPTModel): assert isinstance(gpt_model, GPTModel) @@ -23,6 +31,7 @@ def test_constructor(self, gpt_model: GPTModel): num_weights = sum([p.numel() for p in gpt_model.parameters()]) assert num_weights == 5040 + @pytest.mark.xfail(device_name=='hpu', reason="TELayerNorm is not defined in HPU") def test_set_input_tensor(self, gpt_model: GPTModel): config: TransformerConfig = gpt_model.config sequence_length = gpt_model.max_sequence_length @@ -37,17 +46,18 @@ def test_set_input_tensor(self, gpt_model: GPTModel): assert gpt_model.decoder.input_tensor.shape[1] == micro_batch_size assert gpt_model.decoder.input_tensor.shape[2] == config.hidden_size + @pytest.mark.xfail(device_name=='hpu', reason="TELayerNorm is not defined in HPU") def test_post_process_forward(self, gpt_model: GPTModel): config: TransformerConfig = gpt_model.config sequence_length = gpt_model.max_sequence_length micro_batch_size = 2 - gpt_model.cuda() + gpt_model.to(device_name) data = list(range(sequence_length)) - input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() - position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() - attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + input_ids = 
torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device_name) + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device_name) + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).to(device_name) logits = gpt_model.forward(input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask) @@ -55,15 +65,19 @@ def test_post_process_forward(self, gpt_model: GPTModel): assert logits.shape[1] == sequence_length assert logits.shape[2] == gpt_model.vocab_size + @pytest.mark.xfail(device_name=='hpu', reason="TELayerNorm is not defined in HPU") def test_no_post_process_forward(self, gpt_model: GPTModel): pass + @pytest.mark.xfail(device_name=='hpu', reason="TELayerNorm is not defined in HPU") def test_no_preprocess_forward(self, gpt_model: GPTModel): pass + @pytest.mark.xfail(device_name=='hpu', reason="TELayerNorm is not defined in HPU") def test_state_dict_for_save_checkpoint(self, gpt_model: GPTModel): pass + @pytest.mark.xfail(device_name=='hpu', reason="TELayerNorm is not defined in HPU") def test_load_state_dict(self, gpt_model: GPTModel): pass diff --git a/tests/pipeline_parallel/test_schedules.py b/tests/pipeline_parallel/test_schedules.py index a6bac5b2a3..72c2372ba4 100644 --- a/tests/pipeline_parallel/test_schedules.py +++ b/tests/pipeline_parallel/test_schedules.py @@ -1,3 +1,5 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
+ import torch from tests.test_utilities import Utils from megatron.core import ModelParallelConfig @@ -21,7 +23,9 @@ def test_get_forward_backward_func(): def test_deallocate_output_tensor(): out = torch.tensor([[1, 2, 3], [4, 5, 6]]) schedule.deallocate_output_tensor(out) - assert(out.nelement() == 1) + assert(out.nelement() == 6) + schedule.deallocate_output_tensor(out, True) + assert(out.nelement() == 1) def test_forward_backward_func_without_pipeline_parallel(mocker): from megatron.core.pipeline_parallel import get_forward_backward_func diff --git a/tests/transformer/test_parallel_mlp.py b/tests/transformer/test_parallel_mlp.py index f43dc0b467..098f18a9d6 100644 --- a/tests/transformer/test_parallel_mlp.py +++ b/tests/transformer/test_parallel_mlp.py @@ -1,14 +1,30 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. import pytest import torch +import types from megatron.core.transformer.parallel_mlp import ParallelMLP +from megatron.global_vars import set_args +from deepspeed.accelerator import get_accelerator +device_name = get_accelerator().device_name() @pytest.fixture def mlp(transformer_config): + mlp_args = types.SimpleNamespace( + swiglu=False, + openai_gelu=True, + onnx_safe=False, + bias_gelu_fusion=False, + transformer_impl="", + cache_fp8_weight=False, + fp8_interval=False, + cache_fp8_weight_fwd=False + ) + set_args(mlp_args) return ParallelMLP(transformer_config) @@ -19,28 +35,27 @@ def test_constructor(self, mlp): num_weights = sum([p.numel() for p in mlp.parameters()]) assert num_weights == 1212 - def test_cpu_forward(self, mlp): + def test_cpu_forward(self, mlp, transformer_config): # [sequence length, micro batch size, hidden size] - hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) + hidden_states = torch.ones((32, 2, transformer_config.hidden_size)) output, output_bias = mlp(hidden_states) assert output.shape[0] == 32 assert output.shape[1] == 2 - assert 
output.shape[2] == mlp.config.hidden_size - assert output_bias.shape[0] == mlp.config.hidden_size + assert output.shape[2] == transformer_config.hidden_size + assert output_bias == None assert output.dtype == torch.float32 - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") - def test_gpu_forward(self, mlp): - mlp.cuda() + @pytest.mark.skipif(not get_accelerator().is_available(), reason="accelerator not available") + def test_accelerator_forward(self, mlp, transformer_config): + mlp.to(device_name) # [sequence length, batch size, hidden size] - hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) - hidden_states = hidden_states.cuda() + hidden_states = torch.ones((32, 2, transformer_config.hidden_size)) + hidden_states = hidden_states.to(device_name) output, output_bias = mlp(hidden_states) assert output.shape[0] == 32 assert output.shape[1] == 2 - assert output.shape[2] == mlp.config.hidden_size - assert output_bias.shape[0] == mlp.config.hidden_size + assert output.shape[2] == transformer_config.hidden_size + assert output_bias == None assert output.dtype == torch.float32 - assert output.device.type == 'cuda' - assert output_bias.device.type == 'cuda' + assert output.device.type == device_name diff --git a/tests/unit_tests/test_utilities.py b/tests/unit_tests/test_utilities.py index b35c77b58d..68c6e6b55c 100644 --- a/tests/unit_tests/test_utilities.py +++ b/tests/unit_tests/test_utilities.py @@ -1,21 +1,25 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
+ import os import torch import megatron.core.parallel_state as ps +from deepspeed.accelerator import get_accelerator + class Utils: - world_size = torch.cuda.device_count() - rank = int(os.environ['LOCAL_RANK']) + world_size = int(os.getenv("WORLD_SIZE", '1')) + rank = int(os.getenv('LOCAL_RANK', '0')) @staticmethod def initialize_distributed(): print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') - torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) + get_accelerator().set_device(Utils.rank % get_accelerator().device_count()) init_method = 'tcp://' master_ip = os.getenv('MASTER_ADDR', 'localhost') master_port = os.getenv('MASTER_PORT', '6000') init_method += master_ip + ':' + master_port - torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) + torch.distributed.init_process_group(backend=get_accelerator().communication_backend_name(), world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) @staticmethod def destroy_model_parallel(): @@ -23,8 +27,8 @@ def destroy_model_parallel(): torch.distributed.barrier() @staticmethod - def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): + def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, sequence_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): ps.destroy_model_parallel() if not torch.distributed.is_initialized(): Utils.initialize_distributed() - ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) \ No newline at end of file + ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, sequence_parallel_size, 
virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) \ No newline at end of file diff --git a/tools/hf2megads_weight_converter.py b/tools/hf2megads_weight_converter.py index bfbde1fd05..12468963c5 100755 --- a/tools/hf2megads_weight_converter.py +++ b/tools/hf2megads_weight_converter.py @@ -3,9 +3,11 @@ import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import torch.distributed from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron import print_rank_0, get_tokenizer, get_args from megatron.core import mpu +from megatron.core import tensor_parallel from megatron.core.utils import divide from megatron.model import GPTModelPipe, Float16Module from megatron.utils import unwrap_model @@ -13,20 +15,30 @@ from megatron.arguments import core_transformer_config_from_args from megatron.initialize import initialize_megatron from megatron.optimizer import get_megatron_optimizer -from megatron.checkpointing import save_checkpoint +from megatron.checkpointing import save_checkpoint, load_checkpoint from megatron.training import get_optimizer_param_scheduler from deepspeed.runtime.utils import see_memory_usage import deepspeed +import copy +from pathlib import Path + def add_extra_args(parser): """Text generation arguments.""" group = parser.add_argument_group(title='hf2mega') - group.add_argument("--hf-ckpt-num-shards", type=int, help='num of llama ckpt.') - group.add_argument("--origin-hf-ckpt-dir", + group.add_argument("--hf-ckpt-dir", type=str, default="", - help="the original path of the llama-hf ckpt") + help="the llama-hf ckpt") + group.add_argument("--hf-ckpt-num-shards", type=int, default=-1, help='num of llama ckpt.') + group.add_argument("--load-mode", type=str, + default=None, + choices=['torchbin', 'safetensor', 'auto'], + help="load ckpt format: pytorch.bin or model.safetensor or auto.") + group.add_argument("--to-hf-ckpt", action="store_true", + help="by 
default convert from hf to megads" + "if set, convert reversely from megads to hf ckpt.") return parser @@ -55,6 +67,49 @@ def load_and_print_hf_weight(hf_ckpt_dir, hf_ckpt_num_of_shards): return loaded +def load_and_print_hf_weight_from_safetensor(hf_ckpt_dir, hf_ckpt_num_of_shards): + from safetensors import safe_open + # Optimization point: We can selectively load specific 'shared' data to reduce CPU memory usage. + hf_model = {} + print_rank_0( + f"----------------------------hf weight list----------------------------") + + for wid in range(1, hf_ckpt_num_of_shards + 1): + if hf_ckpt_num_of_shards == 1: + ckpt_path = f"{hf_ckpt_dir}/model.safetensors" + else: + ckpt_path = f"{hf_ckpt_dir}/model-{wid:05d}-of-{hf_ckpt_num_of_shards:05d}.safetensors" + + with safe_open(ckpt_path, framework="pt", device="cpu") as f: + for k in f.keys(): + print_rank_0(f"name: {k}, shape: {f.get_tensor(k).shape}") + assert k not in hf_model + hf_model[k] = f.get_tensor(k).clone() + + return hf_model + + +def load_and_print_hf_weight_auto(hf_ckpt_dir, no_init=True): + from transformers import AutoConfig, AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights + + if no_init: + hf_config = AutoConfig.from_pretrained(hf_ckpt_dir, trust_remote_code=True) + with no_init_weights(): + hf_model = AutoModelForCausalLM.from_config(hf_config, trust_remote_code=True, torch_dtype=torch.bfloat16) + else: + hf_model = {} + hf_auto_model = AutoModelForCausalLM.from_pretrained(hf_ckpt_dir, trust_remote_code=True, torch_dtype=torch.bfloat16) + print_rank_0( + f"----------------------------hf weight list----------------------------") + + for name, param in hf_auto_model.named_parameters(): + hf_model[name] = param.clone() + print_rank_0(name) + + return hf_model + + def print_distinct_weights(model): print_rank_0( f"----------------------------mega-ds weight list----------------------------") @@ -70,16 +125,19 @@ def print_distinct_weights(model): class refactor: - def 
__init__(self, model, loaded, args, config): + def __init__(self, ds_model, hf_model, args, config): tokenizer = get_tokenizer() # align layer number - self.model = model - self.loaded = loaded + self.ds_model = ds_model + self.hf_model = hf_model + self.hf_dict = {} # for handling pp case when converting mds => hf self.config = config self.offset_num = 2 self.mega_emb_wnum = 1 self.mega_norm_wnum = args.num_layers + 2 + self.num_attention_heads = args.num_attention_heads + self.num_key_value_heads = args.num_key_value_heads self.mega_lm_head_wnum = self.mega_norm_wnum + 1 self.token_vocab = tokenizer.vocab_size self.padded_vocab_size = args.padded_vocab_size @@ -95,7 +153,7 @@ def _embedding_refactor(self, pname, p): hf_name = "lm_head.weight" elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight": hf_name = "model.embed_tokens.weight" - hf_w = self.loaded[hf_name] + hf_w = self.hf_model[hf_name] assert hf_w.shape[0] == self.token_vocab per_partition_vocab_size, start_index, end_index = compute_partition_range( self.padded_vocab_size, self.tp_rank, self.tp_size) @@ -112,24 +170,28 @@ def _embedding_refactor(self, pname, p): ) return new_w + + + def _direct_refactor(self, pname, p, hf_layer=None, subname=None): if pname == f"{self.mega_norm_wnum}.weight": hf_name = "model.norm.weight" elif subname in ["input_layernorm.weight", "post_attention_layernorm.weight"]: hf_name = f"model.layers.{hf_layer}.{subname}" - new_w = hf_w = self.loaded[hf_name] + new_w = hf_w = self.hf_model[hf_name] self.record_mapping_info( f"mega-ds:{pname,p.data.shape}<--hf{hf_name,} {hf_w.shape}") return new_w + def _qkv_refactor(self, pname, p, hf_layer): hf_wq_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight" hf_wk_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight" hf_wv_name = f"model.layers.{hf_layer}.self_attn.v_proj.weight" - wq = self.loaded[hf_wq_name] - wk = self.loaded[hf_wk_name] - wv = self.loaded[hf_wv_name] + wq = self.hf_model[hf_wq_name] + wk = 
self.hf_model[hf_wk_name] + wv = self.hf_model[hf_wv_name] hidden_size = wq.shape[0] per_partition_size, start_index, end_index = compute_partition_range( @@ -159,8 +221,8 @@ def _qkv_refactor(self, pname, p, hf_layer): def _mlphto4h_dense_refactor(self, pname, p, hf_layer): hf_w_gate_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight" hf_w_up_name = f"model.layers.{hf_layer}.mlp.up_proj.weight" - w_gate = self.loaded[hf_w_gate_name] - w_up = self.loaded[hf_w_up_name] + w_gate = self.hf_model[hf_w_gate_name] + w_up = self.hf_model[hf_w_up_name] hidden_size = w_gate.shape[0] per_partition_size, start_index, end_index = compute_partition_range( @@ -184,7 +246,7 @@ def _attn_dense_refactor(self, pname, p, hf_layer, subname): else: hf_name = f"model.layers.{hf_layer}.mlp.down_proj.weight" - hf_w = self.loaded[hf_name] + hf_w = self.hf_model[hf_name] hidden_size = hf_w.shape[1] per_partition_size, start_index, end_index = compute_partition_range( hidden_size, self.tp_rank, self.tp_size) @@ -200,7 +262,7 @@ def _mlphto4h1_refactor(self, pname, p, hf_layer, subname): hf_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight" else: hf_name = f"model.layers.{hf_layer}.mlp.up_proj.weight" - hf_w = self.loaded[hf_name] + hf_w = self.hf_model[hf_name] hidden_size = hf_w.shape[0] per_partition_size, start_index, end_index = compute_partition_range( hidden_size, self.tp_rank, self.tp_size) @@ -212,10 +274,11 @@ def _mlphto4h1_refactor(self, pname, p, hf_layer, subname): ) return new_w - def refactor(self): + def transform_from_hf_to_megds(self): assert self.is_refactored == False new_w = None - for pname, p in self.model.named_parameters(): + for pname, p in self.ds_model.named_parameters(): + if pname in [ f"{self.mega_emb_wnum}.word_embeddings.weight", f"{self.mega_lm_head_wnum}.lm_head.weight" @@ -253,6 +316,123 @@ def refactor(self): new_w = None self.is_refactored = True + + def _embedding_refactor_to_hf(self, pname, ds_w): + if pname == 
f"{self.mega_lm_head_wnum}.lm_head.weight": + hf_w = self.hf_model.lm_head.weight + hf_w_name = "lm_head.weight" + elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight": + hf_w = self.hf_model.model.embed_tokens.weight + hf_w_name = "model.embed_tokens.weight" + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + self.hf_dict[hf_w_name] = copy.deepcopy(ds_w_all_rank[:hf_w.shape[0], :]) + + def _direct_refactor_to_hf(self, pname, ds_w, hf_layer=None, subname=None): + if pname in [f"{self.mega_norm_wnum}.weight"]: + hf_w = self.hf_model.model.norm.weight + hf_w_name = "model.norm.weight" + elif subname in ["input_layernorm.weight"]: + hf_w = self.hf_model.model.layers[hf_layer].input_layernorm.weight + hf_w_name = f"model.layers.{hf_layer}.input_layernorm.weight" + elif subname in ["post_attention_layernorm.weight"]: + hf_w = self.hf_model.model.layers[hf_layer].post_attention_layernorm.weight + hf_w_name = f"model.layers.{hf_layer}.post_attention_layernorm.weight" + + self.hf_dict[hf_w_name] = copy.deepcopy(ds_w) + + def _attn_dense_refactor_to_hf(self, pname, ds_w, hf_layer, subname): + if subname == "self_attention.dense.weight": + hf_w = self.hf_model.model.layers[hf_layer].self_attn.o_proj.weight + hf_w_name = f"model.layers.{hf_layer}.self_attn.o_proj.weight" + elif subname == "mlp.dense_4h_to_h.weight": + hf_w = self.hf_model.model.layers[hf_layer].mlp.down_proj.weight + hf_w_name = f"model.layers.{hf_layer}.mlp.down_proj.weight" + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_last_dim(ds_w) + + self.hf_dict[hf_w_name] = copy.deepcopy(ds_w_all_rank) + + def _mlphto4h_dense_refactor_to_hf(self, pname, ds_w, hf_layer): + hf_g_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight" + hf_u_name = f"model.layers.{hf_layer}.mlp.up_proj.weight" + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + ds_w_shape = ds_w_all_rank.shape 
+ ds_w_all_rank = ds_w_all_rank.reshape(self.tp_size, 2, -1, ds_w_shape[-1]) + self.hf_dict[hf_g_name] = copy.deepcopy(ds_w_all_rank[:, 0, :, :].reshape(-1, ds_w_shape[-1])) + self.hf_dict[hf_u_name] = copy.deepcopy(ds_w_all_rank[:, 1, :, :].reshape(-1, ds_w_shape[-1])) + + + def _qkv_refactor_to_hf(self, pname, ds_w, hf_layer): + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + hf_q = self.hf_model.model.layers[hf_layer].self_attn.q_proj.weight + hf_k = self.hf_model.model.layers[hf_layer].self_attn.k_proj.weight + hf_v = self.hf_model.model.layers[hf_layer].self_attn.v_proj.weight + hf_q_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight" + hf_k_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight" + hf_v_name = f"model.layers.{hf_layer}.self_attn.v_proj.weight" + oldshape = hf_q.shape + hidden_size = oldshape[-1] + hidden_size_per_attention_head = divide(hidden_size, + self.config.num_attention_heads) + num_attention_heads_per_partition = divide(self.config.num_attention_heads, + self.tp_size) + newshape = (self.tp_size, num_attention_heads_per_partition, 3, hidden_size_per_attention_head, hidden_size) + ds_w_out = ds_w_all_rank.reshape(*newshape) + self.hf_dict[hf_q_name] = copy.deepcopy(ds_w_out[:, :, 0, :, :].reshape(-1, oldshape[-1])) + self.hf_dict[hf_k_name] = copy.deepcopy(ds_w_out[:, :, 1, :, :].reshape(-1, oldshape[-1])) + self.hf_dict[hf_v_name] = copy.deepcopy(ds_w_out[:, :, 2, :, :].reshape(-1, oldshape[-1])) + + + def transform_from_megads_to_hf(self): + use_gqa = True if self.num_attention_heads != self.num_key_value_heads else False + + for pname, p in self.ds_model.named_parameters(): + if pname in [ + f"{self.mega_emb_wnum}.word_embeddings.weight", + f"{self.mega_lm_head_wnum}.lm_head.weight", + ]: + self._embedding_refactor_to_hf(pname, p) + elif pname in [ + f"{self.mega_norm_wnum}.weight", + ]: + self._direct_refactor_to_hf(pname, p) + else: + mobj = self.decoder_pat.match(pname) + 
layer_num = int(mobj.group(1)) + subname = mobj.group(2) + hf_layer = layer_num - self.offset_num + if subname in ["self_attention.query_key_value.weight"]: + if not use_gqa: + self._qkv_refactor_to_hf(pname, p, hf_layer) + else: + #TODO(billishyahao): Not impl yet ... + assert False + elif subname in ["mlp.dense_h_to_4h.weight"]: + self._mlphto4h_dense_refactor_to_hf(pname, p, hf_layer) + elif subname in [ + "self_attention.dense.weight", + "mlp.dense_4h_to_h.weight" + ]: + self._attn_dense_refactor_to_hf(pname, p, hf_layer, subname) + elif subname in [ + "input_layernorm.weight", + "post_attention_layernorm.weight", + ]: + self._direct_refactor_to_hf(pname, p, hf_layer, subname) + else: + print(f"Unrecognized weight type: {pname}") + raise ValueError(f"Unrecognized weight type: {pname}") + self.is_refactored = True + def record_mapping_info(self, record_msg): self.refactor_weight_list.append(record_msg) @@ -272,7 +452,18 @@ def inorder_show_record(self): torch.distributed.barrier() -def convert_hf_to_mega_ds(): +def load_hf_weights(args, no_init): + if args.load_mode == 'torchbin': + assert no_init == False, "only work with init" + return load_and_print_hf_weight(args.hf_ckpt_dir, args.hf_ckpt_num_shards) + elif args.load_mode == 'safetensor': + assert no_init == False, "only work with init" + return load_and_print_hf_weight_from_safetensor(args.hf_ckpt_dir, args.hf_ckpt_num_shards) + elif args.load_mode == 'auto': + return load_and_print_hf_weight_auto(args.hf_ckpt_dir, no_init) + + +def convert_ckpt(): """Build the model.""" args = get_args() print_rank_0(f'building model ...') @@ -286,49 +477,74 @@ def convert_hf_to_mega_ds(): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed and not args.no_pipeline_parallel: - model = GPTModelPipe(config, num_tokentypes=0, parallel_output=True) + ds_model = GPTModelPipe(config, num_tokentypes=0, parallel_output=True) else: raise NotImplementedError("Not implemented") see_memory_usage(f"After Building Model", 
force=True) if torch.distributed.get_rank() < 2: - print(f"{torch.distributed.get_rank()} {model}") - - # load and initialize HF weight dict - # print hf weights list & mega-ds weights list - hf_ckpt_dir = args.origin_hf_ckpt_dir - hf_ckpt_num_of_shards = args.hf_ckpt_num_shards - loaded = load_and_print_hf_weight(hf_ckpt_dir, hf_ckpt_num_of_shards) - print_distinct_weights(model) - - # refactor weight from hf to mega-ds - - cur_refactor = refactor(model, loaded, args, config) - cur_refactor.refactor() - cur_refactor.inorder_show_record() + print(f"{torch.distributed.get_rank()} {ds_model}") - del loaded + # 'torchbin', 'safetensor', 'auto' + hf_model = load_hf_weights(args, no_init=args.to_hf_ckpt) - unwrapped_model = unwrap_model([model], (torchDDP, LocalDDP, Float16Module)) - optimizer = get_megatron_optimizer(unwrapped_model) - opt_param_scheduler = get_optimizer_param_scheduler(optimizer) + # print_distinct_weights(hf_model) #init model and save print_rank_0(f"before deepspeed init") ds_engine, _, _, _ = deepspeed.initialize( - model=model, - optimizer=optimizer, + model=ds_model, + optimizer=None, args=args, - lr_scheduler=opt_param_scheduler, + lr_scheduler=None, mpu=mpu if args.no_pipeline_parallel else None) print_rank_0(f"after deepspeed init") - print_rank_0(f"mega-ds checkpoint will be saved in {args.save}") - save_checkpoint(0, [ds_engine], optimizer, opt_param_scheduler) - print_rank_0(f"save checkpoint completed") + if args.to_hf_ckpt: + load_checkpoint([ds_engine], None, None, load_only_weights=True) + print_rank_0(f"completed to load deepspeed actual checkpoint") + + # refactor weight from hf to mega-ds and vice versa + + cur_refactor = refactor(ds_model, hf_model, args, config) + if args.to_hf_ckpt: + cur_refactor.transform_from_megads_to_hf() + else: + cur_refactor.transform_from_hf_to_megds() + # cur_refactor.inorder_show_record() + + if args.to_hf_ckpt: + save_path = args.save + if not os.path.exists(save_path): + 
Path(save_path).mkdir(parents=True, exist_ok=True) + ckpt_per_pp_path = os.path.join(save_path, f"model_pp{mpu.get_pipeline_model_parallel_rank()}.pt") + torch.save(cur_refactor.hf_dict, ckpt_per_pp_path) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + print_rank_0(f"hf checkpoint will be saved in {save_path}/release ") + if mpu.is_pipeline_last_stage(): + ## doing checkpoint merging and saving... + # hf_model.tie_weights() + + all_wei = {} + for pprank in range(mpu.get_pipeline_model_parallel_world_size()): + ckpt_per_pp_path = os.path.join(save_path, f"model_pp{pprank}.pt") + partial_wei = torch.load(ckpt_per_pp_path) + all_wei = all_wei | partial_wei + + hf_model.load_state_dict(all_wei) + + # mega-ds checkpoint will be saved in args.save + hf_model.save_pretrained(os.path.join(save_path, "release"), safe_serialization=True) + else: + print_rank_0(f"mega-ds checkpoint will be saved in {args.save}") + save_checkpoint(0, [ds_engine], None, None) + + print_rank_0(f"save checkpoint completed") if __name__ == "__main__": initialize_megatron(extra_args_provider=add_extra_args) - convert_hf_to_mega_ds() + convert_ckpt() diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 399f93c10e..6e117db31a 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -1,3 +1,4 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
"""Processing large data for pretraining.""" @@ -193,10 +194,15 @@ def get_args(): group.add_argument('--tokenizer-type', type=str, required=True, choices=['BertWordPieceLowerCase','BertWordPieceCase', 'GPT2BPETokenizer', 'SentencePieceTokenizer', - 'GPTSentencePieceTokenizer', 'NullTokenizer'], + 'GPTSentencePieceTokenizer', 'HFTokenizer', + 'NullTokenizer'], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='YTTM tokenizer model.') + group.add_argument('--seq-length', type=int, default=None, + help='Maximum sequence length to process.') + group.add_argument('--trust-remote-code', action='store_true', + help='To run HFTokenizer model from local path.') group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file') group.add_argument('--vocab-size', default=786, @@ -229,7 +235,7 @@ def get_args(): print("Are you sure you don't want to split sentences?") # some default/dummy values for the tokenizer - args.rank = 1 + args.rank = 0 args.make_vocab_size_divisible_by = 128 args.tensor_model_parallel_size = 1 args.vocab_extra_ids = 0 diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh new file mode 100644 index 0000000000..1350ea0f2a --- /dev/null +++ b/train_aGPT_7B.sh @@ -0,0 +1,40 @@ +#!/bin/bash --login +#PBS -q lustre_scaling +#PBS -A Aurora_Deployment +#PBS -j oe + +##################################### +# AuroraGPT-7B +# +# Main production script for training +# AuroraGPT-7B @ ALCF +##################################### + +# 1. Navigate into `$PBS_O_WORKDIR` +cd "${PBS_O_WORKDIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') && export HERE +GIT_BRANCH=$(git branch --show-current) && export GIT_BRANCH + + +# 2. source `ALCF/helpers.sh` +source "${HERE}/ALCF/helpers.sh" || exit + +# 3. call `setup` from `./ALCF/helpers.sh` +setup "$@" || exit +# export run_cmd="${run_cmd}" +echo "${run_cmd[@]}" | tee -a "${OUTPUT_LOG}" + +# 4. Tell user where to find output +printf "[!! 
%s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow "${OUTPUT_LOG}")" | tee -a "${OUTPUT_LOG}" + +# # 5. Ignore the following strings on Intel XPU devices +# # (otherwise they'll clutter up logs) +# XPU_IGNORE_STRING="CCL_WARN|\ -\ INFO\ \-\ |real_accelerator\.py|numexpr\.utils|async_io|libaio" + +# if [[ $(ezpz_get_machine_name) == "aurora" ]]; then +# module unload mpich && module load mpich +# fi +# +# 6. Evaluate ${run_cmd} and append outputs to ${OUTPUT_LOG} +# eval "${run_cmd[@]}" |& tee -a "${OUTPUT_LOG}" +eval "${run_cmd[*]}" |& tee -a "${OUTPUT_LOG}" diff --git a/train_agpt_polaris_7B_production.sh b/train_agpt_polaris_7B_production.sh new file mode 100644 index 0000000000..f83b6ebc29 --- /dev/null +++ b/train_agpt_polaris_7B_production.sh @@ -0,0 +1,29 @@ +#!/bin/bash --login +# +# This script can be submitted with `qsub` via: +# +# ```bash +# $ git clone https://github.com/argonee-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed +# $ qsub train_agpt_polaris_7B_production.sh +# ``` + +cd "${PBS_O_WORKDIR}" || exit + +TODAY="$(date "+%Y-%m-%d")" +NOW="$(date "+%Y-%m-%d-%H%M%S")" +OUTDIR="${PBS_O_WORKDIR}/pbslogs/${TODAY}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" +mkdir -p $(dirname "${OUTFILE}") + +echo "${OUTFILE}" >> "$(dirname ${OUTDIR})/latest" +echo "Logging job output to: ${OUTFILE}" + +# export DEBUG=1 +# export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 + +# Path to the data file list: +DFL="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" + +# Launch: +MICRO_BATCH=2 DATA_FILE_LIST="${DFL}" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh new file mode 100644 index 0000000000..259552ffbc --- /dev/null +++ b/train_llama_alcf.sh @@ -0,0 +1,49 @@ +#!/bin/bash --login + +############################################################################### +# Check if running in DEBUG=1 mode. 
+# - If so, this will print each command before it is ran and exit if any of +# them return a nonzero exit status. +############################################################################### +if [[ -n "${DEBUG-}" ]]; then # to use: `DEBUG=1 bash train_llama_alcf.sh` + printf "\e[1;31m%s\e[0m\n" "!! RUNNING IN DEBUG MODE !!" + set -euxo pipefail +fi + +############################################################################### +# Print (but DO NOT EXECUTE !!) each command that would be ran. +# +# Enable with: NOOP=1 PBS_O_WORKDIR=$(pwd) bash train_llama_alcf.sh +############################################################################### +if [[ -v NOOP ]]; then # to use: `NOOP=1 bash train_llama_alcf.sh` + echo "Run NOOP mode" + set -o noexec # same as set -n +fi + +XPU_IGNORE_STRING="CCL_WARN|\ -\ INFO\ \-\ |real_accelerator\.py|numexpr\.utils|async_io|libaio" + +##################### +# MAIN PROGRAM LOGIC +##################### +main() { + # 1. Navigate into `$PBS_O_WORKDIR` + cd "${PBS_O_WORKDIR}" || exit + HERE=$(python3 -c 'import os; print(os.getcwd())') && export HERE + # 2. source `ALCF/helpers.sh` + source "${HERE}/ALCF/helpers.sh" || exit + # 3. call `setup` from `./ALCF/helpers.sh` + setup "$@" || exit + # 4. Take custom args + export custom_args=" $@" + # 5. Update ${run_cmd} (from setup ALCF/helpers.sh) with ${custom_args} + export run_cmd="${run_cmd} ${custom_args}" + # 6. Add "${run_cmd}" to output log + echo "${run_cmd}" | tee -a "${OUTPUT_LOG}" + # 7. Tell user where to find output + printf "[!! %s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow "${OUTPUT_LOG}")" | tee -a "${OUTPUT_LOG}" + # 8. 
Evaluate ${run_cmd} and append outputs to ${OUTPUT_LOG} + eval "${run_cmd}" |& grep -E -v "${XPU_IGNORE_STRING}" |& tee -a "${OUTPUT_LOG}" + set +x +} + +main diff --git a/train_llama_alcf_aurora.sh b/train_llama_alcf_aurora.sh deleted file mode 100644 index 48651dbeb1..0000000000 --- a/train_llama_alcf_aurora.sh +++ /dev/null @@ -1,234 +0,0 @@ -#!/bin/bash --login -#PBS -l walltime=06:00:00 -#PBS -A argonne_tpc -#PBS -q prod -#PBS -l select=48 -#PBS -l filesystems=eagle:home -# - -function sourceFile() { - fp="$1" - if [[ -f "${fp}" ]]; then - echo "Found ${fp}, \`source\`-ing" - # shellcheck source="${fp}" - source "${fp}" - else - echo "ERROR: UNABLE TO SOURCE ${fp}" - fi -} - -# +++++++++++++++ SCRIPT START ++++++++++++++++++++++ -# ---- source ./helpers_alcf.sh --------------------- -cd "${PBS_O_WORKDIR}" || exit -HERE=$(python3 -c 'import os; print(os.getcwd())') -sourceFile "${HERE}/ALCF_utils/helpers_alcf.sh" || exit -# cd ~/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed || exit -# eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 -ezpz || exit -setEnv || exit -saveDSenv || exit -makeHostfiles || exit -setupData "${DATA_FILE_LIST:-${HERE}/data_file_list_reweighted.txt}" || exit -# dfl_fallback="${HERE}/data_file_list_shuf_debug.txt" - -# # ---- DATA SETUP ------------------------------------ -# dfl_debug="./data_file_list_shuf_debug.txt" -# DATA_FILE_LIST="${DATA_FILE_LIST:-${dfl_debug}}" && export DATA_FILE_LIST="${DATA_FILE_LIST}" -# NUM_DOCS=$(wc -l < "${DATA_FILE_LIST}") && export NUM_DOCS="${NUM_DOCS}" -# WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" && export WEIGHT_SUM="${WEIGHT_SUM}" -# DFL_STEM=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") && export DFL_STEM="${DFL_STEM}" -# dcp="${HERE}/.cache/${DFL_STEM}-index-cache" -# DATA_CACHE_PATH="${DATA_CACHE_PATH:-${dcp}}" && export DATA_CACHE_PATH="${DATA_CACHE_PATH}" -# mkdir -p "${DATA_CACHE_PATH}" -# if [[ -n 
"${DOLMA_CHUNK_IDX}" ]]; then -# echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." -# else -# echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" -# fi - - -# ---- Parallelism Settings -------------------------- -PP=${PP:-1} -TP=${TP:-1} -export PP="${PP}" -export TP="${TP}" -export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" -export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} -# export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${PBS_NODEFILE}")} -# ---------------------------------------------------- - -# ---- Llama2 7B Config ----------------------- -export HEADS=${HEADS:-32} -export NLAYERS=${NLAYERS:-32} -export HIDDEN=${HIDDEN:-4096} -export NUM_KV_HEAD=${NUM_KV_HEAD:-8} -export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" -# --------------------------------------------- - -# ---- Run Settings --------------------------- -export LR=${LR:-0.0003} -export SEQ=${SEQ:-4096} -export DTYPE=${DTYPE:-bf16} -export ZERO_STAGE=${ZERO_STAGE:-2} -export MICRO_BATCH=${MICRO_BATCH:-4} -export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} -export TRAIN_ITER=${TRAIN_ITER:-317892} -export SAVE_INTERVAL=${SAVE_INTERVAL:-200} -export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} -export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) -export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} -export TOKENIZER_MODEL="/lus/gecko/projects/Aurora_deployment/AuroraGPT/datasets/dolma/utils/tokenizer.model" -# export EXTRA_ARGS="" -export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" -# --------------------------------------------- - -# ---- Build DeepSpeed Config --------------------------------- -export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" 
-bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit -# ------------------------------------------------------------- - - -# ---- Specify output location -------------------------------- -export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" -# OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} -OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" -export OUTPUT_DIR="${OUTPUT_DIR}" -export OUTPUT_LOG="${OUTPUT_DIR}/output.log" -export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" -echo "${OUTPUT_LOG}" >> "logs/latest" -mkdir -p "${OUTPUT_DIR}" -echo "!!!Please see logs at ${OUTPUT_DIR}" - - -gpt_args=() -ds_args=" " -ds_args=" --deepspeed ${ds_args}" -if [ "$PP" == 1 ]; then - ds_args=" --no-pipeline-parallel ${ds_args}" -fi -ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" -ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" - -# BUG: [???] ---------------------------------------------------------------- -# I dont know where this came from... -# > we are now using activation checkpoint provided by megatron, see below. -# --------------------------------------------------------------------------- -# -# NOTE: [???] --------------------------------------------------------------- -# In `train_llama_alcf_polaris.sh` we also pass -# `"--checkpoint-num-layers 1"` -# ---------------------------------------------------------------------------- -if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then - echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" 
- ds_args=" --deepspeed-activation-checkpointing ${ds_args}" - gpt_args+=( - "--checkpoint-activations" - ) - # "--checkpoint-num-layers 1" - # --checkpoint-activations \ - # --deepspeed-activation-checkpointing -fi - -# take custom args -custom_args=" $@" - -# Ensure `./hostfile_deepspeed` and `./hostfile_mpich` exist in $(pwd) -hfds="${HERE}/hostfile_deepspeed" -hfmpi="${HERE}/hostfile_mpich" -[ -f "$hfds" ] || exit -[ -f "$hfmpi" ] || exit - -# launcher setting -LAUNCHER=${LAUNCHER:-MPICH} -if [[ $LAUNCHER == "deepspeed" ]]; then - launcher="" -else - launcher="--force_multi --hostfile ${hfds} --launcher=${LAUNCHER} --launcher_args='-hostfile ${hfmpi}'" -fi - - -if [[ $(hostname) == x4* ]]; then - CCL=${CCL:-ccl} - BE="${CCL}" -elif [[ $(hostname) == x3* ]]; then - NCCL=${NCCL:-nccl} - BE="${NCCL}" -fi -# NCCL=${NCCL:-nccl} -EXEC=pretrain_gpt_alcf.py - -# MODEL=LLAMA_7B -# OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" -echo "- WORLD_SIZE:${WORLD_SIZE}" -echo "- BACKEND: ${BE}" -echo "- MODEL_TYPE: ${MODEL_TYPE}" -echo "- DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" -echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" -echo "- Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" - -run_cmd=" - deepspeed $launcher ${EXEC} \ - --use-flash-attn \ - --num-key-value-heads ${NUM_KV_HEAD} \ - --tensor-model-parallel-size $TP \ - --pipeline-model-parallel-size $PP \ - --num-layers $NLAYERS \ - --hidden-size $HIDDEN \ - --num-attention-heads $HEADS \ - --seq-length $SEQ \ - --max-position-embeddings $SEQ \ - --micro-batch-size $MICRO_BATCH \ - --global-batch-size $GLOBAL_BATCH \ - --train-iters $TRAIN_ITER \ - --lr ${LR} \ - --lr-decay-style cosine \ - --log-interval 1 \ - --save-interval ${SAVE_INTERVAL} \ - --split 100,0,0 \ - --$DTYPE \ - --no-masked-softmax-fusion \ - 
--no-bias-gelu-fusion \ - --no-bias-dropout-fusion \ - --no-gradient-accumulation-fusion \ - --distributed-backend ${BE} \ - --tokenizer-type Llama2Tokenizer \ - --save checkpoints/${OUTPUT_PREFIX} \ - --load checkpoints/${OUTPUT_PREFIX} \ - --use-checkpoint-opt_param-scheduler \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --data-file-list ${DATA_FILE_LIST} \ - --data-cache-path ${DATA_CACHE_PATH} \ - $ds_args \ - ${LLAMA_ARGS} \ - ${gpt_args[*]} \ - $custom_args \ - |& tee ${OUTPUT_LOG} - " - # >> ${OUTPUT_LOG} 2>&1 & - # |& tee $OUTPUT_DIR/output.log - -# --ffn-hidden-size 11008 \ -# --vocab-file $VOCAB_FILE \ -# --merge-file $MERGE_FILE \ -# --lr-decay-iters 320000 \ -# --num-workers 0 \ -# --eval-iters ${EVAL_ITERS} \ -# --eval-interval ${EVAL_INTERVAL} \ -# --lr-warmup-iters 5000 \ -# --lr-decay-iters 10000 \ -# --accumulate-allreduce-grads-in-fp32 \ -# --data-impl mmap \ - -echo "All DeepSpeed(s): $(which -a deepspeed)" -echo "Using $(which deepspeed)" -ds_report - -echo "${run_cmd}" - -printf "[!! 
\e[1;31m%s\e[0m] View output at:\n" "NOTE" -printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" - -eval "${run_cmd}" -set +x diff --git a/train_llama_alcf_polaris_hzheng.sh b/train_llama_alcf_polaris_hzheng.sh index 0ca7cb78bb..83d8a2c5a7 100755 --- a/train_llama_alcf_polaris_hzheng.sh +++ b/train_llama_alcf_polaris_hzheng.sh @@ -4,25 +4,22 @@ #PBS -q debug-scaling #PBS -l select=2 #PBS -l filesystems=eagle:grand:home -export PPN=4 -export MD=/home/hzheng/ALCF-Megatron-DeepSpeed -module load conda/2023-10-04 -#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 -conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed +source /eagle/argonne_tpc/soft/conda.sh + export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) export TP=1 export PP=1 export MBS=1 export BS=$((MBS*PBS_JOBSIZE*PPN/PP/TP)) export SP=$((PBS_JOBSIZE*PPN/PP/TP)) -#export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/" - -export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") -export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple/" -#export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_select.txt" -DATA_FILE_LIST=$PWD/test.txt -echo "BS: $BS\n PP:$PP \n TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" +export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" +echo "BS: $BS - PP:$PP - TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" +# First time running, it will compile the fused kernels, which will take about 10 mins +# >>> done with compiling and loading fused kernels. 
Compilation time: 545.468 seconds HIDDEN_SIZE=4096 NUM_LAYERS=32 @@ -31,8 +28,9 @@ EMBEDDINGS=2048 TRAIN_ITERS=10 ZERO_STAGE=2 MODEL=LLAMA_7B +#LAUNCHER="//eagle/argonne_tpc/soft/Megatron-DeepSpeed/..//conda/2024-03-11/lib/python3.10/site-packages/deepspeed/launcher/launcher_helper.py --launcher mpich " OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE python3 ./pretrain_gpt_alcf.py \ +APRUN_PMI=pmix aprun -n $((PBS_JOBSIZE*PPN)) -N $PPN --cc depth -d 16 /eagle/argonne_tpc/soft/Megatron-DeepSpeed/local_rank.sh python3 $LAUNCHER ./pretrain_gpt_alcf.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers ${NUM_LAYERS} \ @@ -74,4 +72,5 @@ MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --data-file-list ${DATA_FILE_LIST} \ --data-path ${DATA_PATH} \ --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ - --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed \ + --data-cache-path ./data_cache_path/ diff --git a/train_llama_alcf_sunspot.sh b/train_llama_alcf_sunspot.sh deleted file mode 100644 index d5e83c57a0..0000000000 --- a/train_llama_alcf_sunspot.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash --login -#PBS -l walltime=06:00:00 -#PBS -A argonne_tpc -#PBS -q prod -#PBS -l select=48 -#PBS -l filesystems=eagle:home - -function sourceFile() { - fp="$1" - echo "source-ing ${fp}" - if [[ -f "${fp}" ]]; then - # shellcheck source="${fp}" - source "${fp}" - else - echo "ERROR: UNABLE TO SOURCE ${fp}" - fi -} - -module () { - if [ -z "${LMOD_SH_DBG_ON+x}" ] - then - case "$-" in - (*v*x*) __lmod_sh_dbg='vx' ;; - (*v*) __lmod_sh_dbg='v' ;; - (*x*) __lmod_sh_dbg='x' 
;; - esac - fi - if [ -n "${__lmod_sh_dbg:-}" ] - then - set +$__lmod_sh_dbg - echo "Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output" >&2 - fi - eval "$($LMOD_CMD $LMOD_SHELL_PRGM "$@")" && eval "$(${LMOD_SETTARG_CMD:-:} -s sh)" - __lmod_my_status=$? - if [ -n "${__lmod_sh_dbg:-}" ] - then - echo "Shell debugging restarted" >&2 - set -$__lmod_sh_dbg - fi - unset __lmod_sh_dbg - return $__lmod_my_status -} - -# -# eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" -# conda activate q4-drop - -if [[ $(hostname) == x1* || $(hostname) == x4* ]] ; then - echo "!!!! Caught Intel XPU, using CPU_OPTIMIZER !!!!" - export CPU_OPTIMIZER=1; -fi - - -# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- -cd "${PBS_O_WORKDIR}" || exit -HERE=$(python3 -c 'import os; print(os.getcwd())') -export HERE -# ---- 1. Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- -export EXEC="${HERE}/pretrain_gpt_alcf.py" -[ -f "${EXEC}" ] || exit -# ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ -sourceFile "${HERE}/ALCF/helpers.sh" || exit -# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------------------------------------------ -setEnv || exit # 1. load `conda` environment -saveDSenv || exit # 2. save env vars to `.deepspeed_env` -ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars -makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` -setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` -buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ -setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} -setArgs || exit # 8. specify additional `deepspeed` arguments -setData "${DATA_FILE_LIST}"|| exit # 9. 
specify `DATA_FILE_LIST` for dolma dataset -setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` -printJobInfo || exit # 11. print job info -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -# Take custom args -custom_args=" $@" - -# Assert `./hostfile_deepspeed` exists -export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit - -# hf="${HOSTFILE:-${PBS_NODEFILE}}" -# nh=$(wc -l "${hf}") -# if [[ "${nh}" -gt 1 ]]; then -# launch_cmd="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" -# else -# launch_cmd="python3 ${EXEC}" -# fi -# -# echo "launch_cmd: ${launch_cmd}" - - # --use-flash-attn-v2 \ - # python3 ${EXEC} \ -run_cmd=" - deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ - --$DTYPE \ - --num-workers 0 \ - --split 100,0,0 \ - --log-interval 1 \ - --no-bias-gelu-fusion \ - --lr-decay-style cosine \ - --no-bias-dropout-fusion \ - --no-masked-softmax-fusion \ - --tokenizer-type Llama2Tokenizer \ - --no-gradient-accumulation-fusion \ - --accumulate-allreduce-grads-in-fp32 \ - --use-checkpoint-opt_param-scheduler \ - --lr ${LR} \ - --seq-length $SEQ \ - --save ${CKPT_DIR} \ - --load ${CKPT_DIR} \ - --num-layers ${NLAYERS} \ - --hidden-size ${HIDDEN} \ - --train-iters ${TRAIN_ITER} \ - --eval-iters ${EVAL_ITERS} \ - --distributed-backend ${BE} \ - --num-attention-heads ${HEADS} \ - --save-interval ${SAVE_INTERVAL} \ - --eval-interval ${EVAL_INTERVAL} \ - --max-position-embeddings ${SEQ} \ - --micro-batch-size ${MICRO_BATCH} \ - --data-file-list ${DATA_FILE_LIST} \ - --tensor-model-parallel-size ${TP} \ - --global-batch-size ${GLOBAL_BATCH} \ - --pipeline-model-parallel-size ${PP} \ - --num-key-value-heads ${NUM_KV_HEAD} \ - --data-cache-path ${DATA_CACHE_PATH} \ - --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - ${LLAMA_ARGS} \ - $ds_args \ - ${gpt_args[*]} \ - $custom_args \ - |& tee ${OUTPUT_LOG} 
- " - - # --------------------------------------------------- - # --vocab-file $VOCAB_FILE \ - # --merge-file $MERGE_FILE \ - # --lr-decay-iters 320000 \ - # --lr-warmup-iters 5000 \ - # --lr-decay-iters 10000 \ - # --num-workers 4 \ - # launch python3 ${EXEC} \ - # --data-impl mmap \ - # source ./ezpz/src/ezpz/bin/getjobenv || exit - # --------------------------------------------------- - # ${DIST_LAUNCH} ./local_rank.sh python3 ${EXEC} \ - # ${DIST_LAUNCH} python3 ${EXEC} \ - # deepspeed $launcher ${EXEC} \ - # >> ${OUTPUT_LOG} 2>&1 & - # >> ${OUTPUT_LOG} 2>&1 & - # |& tee $OUTPUT_DIR/output.log - # ${EXTRA_ARGS} \ - -echo "All DeepSpeed(s): $(which -a deepspeed)" -echo "Using $(which deepspeed)" -ds_report - -echo "${run_cmd}" - -printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" -printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" -# echo "${OUTPUT_LOG}" -eval "${run_cmd}" -set +x diff --git a/train_llama_alcf_polaris.sh b/train_llama_nersc_perlmutter.sh similarity index 70% rename from train_llama_alcf_polaris.sh rename to train_llama_nersc_perlmutter.sh index 2e1a23010c..8131579809 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_nersc_perlmutter.sh @@ -1,9 +1,11 @@ #!/bin/bash --login -#PBS -l walltime=06:00:00 -#PBS -A argonne_tpc -#PBS -q prod -#PBS -l select=48 -#PBS -l filesystems=eagle:home +#SBATCH -A m4388_g +#SBATCH -C 'gpu&hbm80g' +#SBATCH -q regular +#SBATCH -t 00:30:00 +#SBATCH --nodes 128 +#SBATCH --gpus 512 +# function sourceFile() { fp="$1" @@ -18,9 +20,11 @@ function sourceFile() { # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- -cd "${PBS_O_WORKDIR}" || exit +# cd "${PBS_O_WORKDIR}" || exit +cd "${SLURM_SUBMIT_DIR}" || exit HERE=$(python3 -c 'import os; print(os.getcwd())') export HERE +# dflfb="${HERE}/genslm-subsample.txt" # ---- 1. 
Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- export EXEC="${HERE}/pretrain_gpt_alcf.py" [ -f "${EXEC}" ] || exit @@ -35,7 +39,7 @@ setParams || exit # 5. set command line arguments to pass to ` buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} setArgs || exit # 8. specify additional `deepspeed` arguments -setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setData "${DATA_FILE_LIST:-${dflfb}}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` printJobInfo || exit # 11. print job info # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -45,16 +49,40 @@ custom_args=" $@" # Assert `./hostfile_deepspeed` exists export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" # source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit # echo "Using $(which python3)" # --launcher_args='--pmi=pmix' # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ # ${launch_cmd} \ + # --optimizer adam \ + # --use-flash-attn-v2 \ + # deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ +# source ezpz/src/ezpz/bin/getjobenv || exit +# if [[ -z "${DIST_LAUNCH}" ]]; then +# setupSrun || exit +# echo "Using SRUN_EXEC: ${SRUN_EXEC}" +# else +# SRUN_EXEC="${DIST_LAUNCH}" +# fi +# echo "Using SRUN_EXEC: ${SRUN_EXEC}" +# +export NHOSTS="${SLURM_NNODES:-1}" +export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" +export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" +export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + + # srun --gpus ${NGPUS} \ + # --gpus-per-node ${NGPU_PER_HOST} \ + # -N ${NHOSTS} \ + # -n ${NGPUS} \ + # -l -u --verbose python3 
${EXEC} \ run_cmd=" - deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ - --use-flash-attn-v2 \ + ${SRUN_EXEC} python3 ${EXEC} \ --$DTYPE \ + --optimizer ${OPT} \ --num-workers 0 \ --split 100,0,0 \ --log-interval 1 \ @@ -66,10 +94,13 @@ run_cmd=" --no-gradient-accumulation-fusion \ --accumulate-allreduce-grads-in-fp32 \ --use-checkpoint-opt_param-scheduler \ + --tensorboard-dir ${TBDIR} \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ --lr ${LR} \ - --seq-length $SEQ \ --save ${CKPT_DIR} \ --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ --num-layers ${NLAYERS} \ --hidden-size ${HIDDEN} \ --train-iters ${TRAIN_ITER} \ @@ -95,9 +126,10 @@ run_cmd=" |& tee ${OUTPUT_LOG} " +run_cmd=$(echo "${run_cmd}" | sed -e 's/ */ /g') -echo "All DeepSpeed(s): $(which -a deepspeed)" -echo "Using $(which deepspeed)" +# echo "All DeepSpeed(s): $(which -a deepspeed)" +echo "! Using $(which deepspeed)" ds_report echo "${run_cmd}" diff --git a/train_sbatch_pp64.sh b/train_sbatch_pp64.sh deleted file mode 100755 index b7baf2539e..0000000000 --- a/train_sbatch_pp64.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash --login -#SBATCH -A m3957_g -#SBATCH -C 'gpu&hbm80g' -#SBATCH -q regular -#SBATCH -t 00:30:00 -#SBATCH --nodes 128 -#SBATCH --gpus 512 - - -# TODO:: -# - Add logic for catching / killing hung process at end of run to ensure -# second run starts up (otherwise, it will wait for the hung process, which -# will run until the job is killed) -# - This wll let us try running multiple experiments in a single slurm job -# allocation. 
-# - Existing (similar implementation) from my `~/bin/kill-match`: -# ```bash -# #!/bin/bash --login -# TO_KILL=$1 -# kill $(ps aux | grep -E "$USER.+($TO_KILL)" | grep -v grep | awk '{print $2}') - - -PPSIZE=64 \ - MODEL_SIZE_KEY="GPT1T_$(( 2 * PPSIZE ))L" \ - SEQ_LEN=2048 \ - MICRO_BATCH=2 \ - GAS=$(( 8 * PPSIZE )) \ - SP_TYPE=megatron \ - ZERO_STAGE=1 \ - USE_SEQUENCE_PARALLEL=0 \ - MPSIZE=8 \ - SPSIZE=1 \ - USE_ACTIVATION_CHECKPOINTING=1 \ - ./ALCF/train-gpt3.sh