Commit d7914ad
new old forgotten commits
hearues-zueke-github committed Oct 26, 2020
1 parent 6681d30 commit d7914ad
Showing 11 changed files with 818 additions and 43 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
nim

.idea

*.pyc
1 change: 1 addition & 0 deletions compress_enwiki8/.gitignore
@@ -1,2 +1,3 @@
*.pkl.gz
*.hex
data_enwik8
192 changes: 192 additions & 0 deletions compress_enwiki8/compress_enwik8_attempt_2.py
@@ -0,0 +1,192 @@
#! /usr/bin/python3
# -*- coding: utf-8 -*-

import dill
import gzip
import os
import sys

# import tempfile
from memory_tempfile import MemoryTempfile
tempfile = MemoryTempfile()

from collections import defaultdict
from copy import deepcopy
from dotmap import DotMap
from operator import itemgetter

from os.path import expanduser

import multiprocessing as mp

PATH_ROOT_DIR = os.path.dirname(os.path.abspath(__file__)).replace("\\", "/")+"/"
HOME_DIR = os.path.expanduser("~")
TEMP_DIR = tempfile.gettempdir()+"/"

from PIL import Image

import numpy as np

sys.path.append("../")
from utils_serialization import get_pkl_gz_obj, save_pkl_gz_obj
import global_object_getter_setter

import utils_compress_enwik8

from create_stats_enwik8 import calc_sorted_stats

def calc_stats_using_bytes_tuple(arr, max_len):
    used_len = 1000000
    max_amount_values = 50
    # max_len = 6
    l_stat = []
    s_chars = set()
    for pos_i in range(0, arr.shape[0], used_len):
        arr_1 = arr[pos_i:pos_i+used_len+max_len].reshape((-1, 1))
        print("pos_i: {:9}, {:9}".format(pos_i, pos_i+used_len))
        for _ in range(0, max_len-1):
            arr_1 = np.hstack((arr_1[:-1], arr_1[1:, -1:]))
        u, c = np.unique(arr_1.reshape((-1, )).view(','.join(['u1']*max_len)), return_counts=True)
        if max_len == 1:
            d = {tuple((t, )): j for t, j in zip(u, c)}
        else:
            d = {tuple(t): j for t, j in zip(u, c)}

        # get the max len for each separate combined bytes!
        l_t, l_j = list(zip(*list(d.items())))
        i_max = np.argmax(l_j)

        print("- max_len: {:2}, amount: {:10}, mult: {:10}, t: {}".format(max_len, l_j[i_max], max_len*l_j[i_max], l_t[i_max]))
        print("-- len(d): {}".format(len(d)))
        s_chars |= set(list(d.keys()))

        l = list(d.items())
        l_sort = sorted(list(d.items()), reverse=True, key=lambda x: (x[1], x[0]))

        l_stat.append('{:9},{:9}:{}'.format(
            pos_i,
            pos_i+used_len,
            '|'.join(['{},{:5}'.format(''.join(map(lambda x: '{:02X}'.format(x), t)), c) for t, c in l_sort[:max_amount_values]])
        ))
    l = sorted(s_chars)
    print("l: {}".format(l))
    print("len(l): {}".format(len(l)))

    with open(TEMP_DIR+'enwik8_stats_max_len_{}.txt'.format(max_len), 'w') as f:
        f.write('\n'.join(l_stat)+'\n')


if __name__ == "__main__":
    file_object_name = 'global_compress_enwik8_attempt_2_object'

    if not global_object_getter_setter.do_object_exist(file_object_name):
        arr = utils_compress_enwik8.get_arr(used_length=-1)

        # calc_stats_using_bytes_tuple(arr, 1)

        # l_proc = []
        # cpu_count = mp.cpu_count()
        # for i in range(2, cpu_count+2):
        #     l_proc.append(mp.Process(target=calc_stats_using_bytes_tuple, args=(arr, i)))
        # for proc in l_proc: proc.start()
        # for proc in l_proc: proc.join()

        d_all_part, l_sort = calc_sorted_stats()
        # every 3-byte tuple gets a 2-byte code (0, idx) first
        d3 = d_all_part[3]
        l_k_3 = list(d3.keys())
        l_k_i_2_byte = [(k, (0, i)) for i, k in enumerate(l_k_3, 0)]

        # rank tuples of length 4..13 by total bytes covered (len(k)*count);
        # the best fill the remaining 2-byte code slots, the rest get 3-byte codes
        l_sort_ge_4_byte = sorted([(len(k)*v, -len(k), v, k) for k1 in range(4, 14) for k, v in d_all_part[k1].items()], reverse=True)
        l_k_i_2_byte += [(k, (0, i)) for i, (_, _, _, k) in enumerate(l_sort_ge_4_byte[:256-len(l_k_3)], len(l_k_3))]

        l_k_i_3_byte = [(k, (0, i//256, i%256)) for i, (_, _, _, k) in enumerate(l_sort_ge_4_byte[256-len(l_k_3):], 0)]

        d_obj = {
            'arr': arr,
            'd_all_part': d_all_part,
            'l_sort': l_sort,
            'd3': d3,
            'l_k_3': l_k_3,
            'l_k_i_2_byte': l_k_i_2_byte,
            'l_sort_ge_4_byte': l_sort_ge_4_byte,
            'l_k_i_3_byte': l_k_i_3_byte,
            # 'd_k_to_count': d_k_to_count,
            # 'd_k_to_i_byte': d_k_to_i_byte,
            # 'l_arr': l_arr,
        }
        print('Save global DATA!')
        global_object_getter_setter.save_object(file_object_name, d_obj)
    else:
        print('Load global DATA!')
        d_obj = global_object_getter_setter.load_object(file_object_name)
        arr = d_obj['arr']
        d_all_part = d_obj['d_all_part']
        l_sort = d_obj['l_sort']
        d3 = d_obj['d3']
        l_k_3 = d_obj['l_k_3']
        l_k_i_2_byte = d_obj['l_k_i_2_byte']
        l_sort_ge_4_byte = d_obj['l_sort_ge_4_byte']
        l_k_i_3_byte = d_obj['l_k_i_3_byte']
        # d_k_to_count = d_obj['d_k_to_count']
        # d_k_to_i_byte = d_obj['d_k_to_i_byte']
        # l_arr = d_obj['l_arr']

    d_k_to_count = {k: v for k1 in range(3, 14) for k, v in d_all_part[k1].items()}
    d_k_to_i_byte = dict(l_k_i_2_byte+l_k_i_3_byte)

    print("len(d_k_to_count): {}".format(len(d_k_to_count)))
    print("len(d_k_to_i_byte): {}".format(len(d_k_to_i_byte)))

    assert set(list(d_k_to_count)) == set(list(d_k_to_i_byte))

    l_arr = arr.tolist()
    l_encrypt = []

    # greedy pass: at each position replace the longest known tuple with its
    # short code, otherwise emit the literal byte
    max_len = 13
    length = len(l_arr)
    i = 0
    while i < length:
        l = []
        l_count = []
        l_mult = []

        length_byte = 3
        j = i+3
        while j <= length and length_byte <= max_len:
            t = tuple(l_arr[i:j])

            if t in d_k_to_count:
                l.append(t)
                c = d_k_to_count[t]
                l_count.append(c)
                l_mult.append(len(t)*c)

            j += 1
            length_byte += 1

        if len(l) == 0:
            l_encrypt.append(l_arr[i])
            i += 1
        else:
            i_max = len(l)-1  # last entry == longest match
            # i_max = np.argmax(l_mult)
            t_max = l[i_max]
            l_byte = d_k_to_i_byte[t_max]
            l_encrypt.extend(l_byte)
            i += len(t_max)

        # print("l: {}".format(l))
        # print("l_count: {}".format(l_count))
        # print("l_mult: {}".format(l_mult))
        # break

        if i % 10000 == 0:
            print("i: {}".format(i))

        if i > 10000000:
            break
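The loop above is the greedy encoding pass: at each position it collects every known byte tuple of length 3 to 13 starting there, keeps the longest one found (the last entry of l), and emits its pre-assigned 2- or 3-byte code, falling back to the literal byte when nothing matches. A minimal self-contained sketch of that matching step follows; the helper name greedy_encode and the toy codebook are invented for illustration, while the real d_k_to_i_byte comes from calc_sorted_stats().

def greedy_encode(l_arr, d_k_to_i_byte, min_len=3, max_len=13):
    # Sketch of the greedy longest-match step from the loop above.
    l_encrypt = []
    i = 0
    length = len(l_arr)
    while i < length:
        t_max = None
        # Try every known tuple starting at i; the longest match found wins,
        # mirroring i_max = len(l)-1 above.
        for j in range(i+min_len, min(i+max_len, length)+1):
            t = tuple(l_arr[i:j])
            if t in d_k_to_i_byte:
                t_max = t
        if t_max is None:
            l_encrypt.append(l_arr[i])  # no match: emit the literal byte
            i += 1
        else:
            l_encrypt.extend(d_k_to_i_byte[t_max])  # emit the short code
            i += len(t_max)
    return l_encrypt

# Invented codebook with 0x00 as escape byte, shaped like l_k_i_2_byte entries.
d = {(97, 98, 99): (0, 0), (97, 98, 99, 100): (0, 1)}
print(greedy_encode(list(b'abcdabcx'), d))  # -> [0, 1, 0, 0, 120]

The sketch tests membership in d_k_to_i_byte directly, which the assert above guarantees has the same key set as d_k_to_count. Note that both code shapes start with the 0x00 escape byte, so a decoder would still need a rule to tell a 2-byte code apart from a 3-byte one.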
102 changes: 89 additions & 13 deletions compress_enwiki8/compress_enwiki8.py
@@ -6,17 +6,23 @@
import gzip
import os
import sys
import tempfile

# import tempfile
from memory_tempfile import MemoryTempfile
tempfile = MemoryTempfile()

from collections import defaultdict
from copy import deepcopy
from dotmap import DotMap
from operator import itemgetter
# from sortedcontainers import SortedSet

from os.path import expanduser
PATH_HOME = expanduser("~")+'/'
print("PATH_HOME: {}".format(PATH_HOME))

import multiprocessing as mp

PATH_ROOT_DIR = os.path.dirname(os.path.abspath(__file__)).replace("\\", "/")+"/"
HOME_DIR = os.path.expanduser("~")
TEMP_DIR = tempfile.gettempdir()+"/"

from PIL import Image

@@ -27,10 +33,8 @@
import global_object_getter_setter

import utils_compress_enwik8
# utils_compress_enwik8.do_some_simple_tests()
# sys.exit()

PATH_ROOT_DIR = os.path.abspath(os.path.dirname(sys.argv[0]))+"/"
from create_stats_enwik8 import calc_sorted_stats

def create_dict_word_count_for_arr(arr, max_byte_length=10):
    d_arr_comb = {}
@@ -59,13 +63,84 @@ def create_dict_word_count_for_arr(arr, max_byte_length=10):


if __name__ == "__main__":
    arr = utils_compress_enwik8.get_arr(used_length=-1)

    def calc_stats_using_bytes_tuple(arr, max_len):
        used_len = 1000000
        max_amount_values = 50
        # max_len = 6
        l_stat = []
        s_chars = set()
        for pos_i in range(0, arr.shape[0], used_len):
            arr_1 = arr[pos_i:pos_i+used_len+max_len].reshape((-1, 1))
            print("pos_i: {:9}, {:9}".format(pos_i, pos_i+used_len))
            for _ in range(0, max_len-1):
                arr_1 = np.hstack((arr_1[:-1], arr_1[1:, -1:]))
            u, c = np.unique(arr_1.reshape((-1, )).view(','.join(['u1']*max_len)), return_counts=True)
            if max_len == 1:
                d = {tuple((t, )): j for t, j in zip(u, c)}
            else:
                d = {tuple(t): j for t, j in zip(u, c)}

# arr = utils_compress_enwik8.get_arr(used_length=2**21)
arr = utils_compress_enwik8.get_arr(used_length=2**18)
# arr = utils_compress_enwik8.get_arr(used_length=2**23)
bytes_starting_size = arr.shape[0]
# arr = utils_compress_enwik8.get_arr(used_length=2**22+1)

            # get the max len for each separate combined bytes!
            l_t, l_j = list(zip(*list(d.items())))
            i_max = np.argmax(l_j)

            print("- max_len: {:2}, amount: {:10}, mult: {:10}, t: {}".format(max_len, l_j[i_max], max_len*l_j[i_max], l_t[i_max]))
            print("-- len(d): {}".format(len(d)))
            s_chars |= set(list(d.keys()))

            l = list(d.items())
            l_sort = sorted(list(d.items()), reverse=True, key=lambda x: (x[1], x[0]))

            l_stat.append('{:9},{:9}:{}'.format(
                pos_i,
                pos_i+used_len,
                '|'.join(['{},{:5}'.format(''.join(map(lambda x: '{:02X}'.format(x), t)), c) for t, c in l_sort[:max_amount_values]])
            ))
        l = sorted(s_chars)
        print("l: {}".format(l))
        print("len(l): {}".format(len(l)))

        with open(TEMP_DIR+'enwik8_stats_max_len_{}.txt'.format(max_len), 'w') as f:
            f.write('\n'.join(l_stat)+'\n')

    # calc_stats_using_bytes_tuple(arr, 1)

    # l_proc = []
    # cpu_count = mp.cpu_count()
    # for i in range(2, cpu_count+2):
    #     l_proc.append(mp.Process(target=calc_stats_using_bytes_tuple, args=(arr, i)))
    # for proc in l_proc: proc.start()
    # for proc in l_proc: proc.join()

    d_all_part, l_sort = calc_sorted_stats()
    d3 = d_all_part[3]
    l_k_3 = list(d3.keys())
    l_k_i_2_byte = [(k, (0, i)) for i, k in enumerate(l_k_3, 0)]

    l_sort_ge_4_byte = sorted([(len(k)*v, -len(k), v, k) for k1 in range(4, 14) for k, v in d_all_part[k1].items()], reverse=True)
    l_k_i_2_byte += [(k, (0, i)) for i, (_, _, _, k) in enumerate(l_sort_ge_4_byte[:256-len(l_k_3)], len(l_k_3))]

    l_k_i_3_byte = [(k, (0, i//256, i%256)) for i, (_, _, _, k) in enumerate(l_sort_ge_4_byte[256-len(l_k_3):], 0)]

    d_k_to_count = {k: v for k1 in range(3, 14) for k, v in d_all_part[k1].items()}
    d_k_to_i_byte = dict(l_k_i_2_byte+l_k_i_3_byte)

    print("len(d_k_to_count): {}".format(len(d_k_to_count)))
    print("len(d_k_to_i_byte): {}".format(len(d_k_to_i_byte)))

    assert set(list(d_k_to_count)) == set(list(d_k_to_i_byte))

    l_encrypt = []

    l_arr = arr.tolist()

    sys.exit()

    bytes_starting_size = arr.shape[0]

    # global_object_getter_setter.delete_object(OBJ_NAME_D_ARR_COMB)
    # global_object_getter_setter.delete_object(OBJ_NAME_D_ARR_COMB_UNIQUE)
@@ -411,7 +486,6 @@ def create_dict_word_count_for_arr(arr, max_byte_length=10):
    # sys.exit()

    print()
    print("bytes_starting_size: {}".format(bytes_starting_size))
    print("LEN_BITS_CHOSEN_INDEX: {}".format(LEN_BITS_CHOSEN_INDEX))
    print("LEN_CHOSEN_INDEX: {}".format(LEN_CHOSEN_INDEX))
    print("MAX_BYTE_LENGTH: {}".format(MAX_BYTE_LENGTH))
@@ -440,6 +514,8 @@ def create_dict_word_count_for_arr(arr, max_byte_length=10):
    if not os.path.exists(tmp_hex_dir):
        os.makedirs(tmp_hex_dir)

    print("bytes_starting_size: {}".format(bytes_starting_size))

    arr_compressed_full.tofile(
        (tmp_hex_dir+'content_compressed_size_orig_{size_orig}_size_comp_{size_comp}_round_nr_{round_nr}'+
        '_max_idx_{LEN_CHOSEN_INDEX}_max_word_len_{MAX_BYTE_LENGTH}_max_first_pos_{MAX_FIRST_POS}.hzzv2.hex').format(
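Both versions of this file share calc_stats_using_bytes_tuple, whose core trick is building all overlapping n-byte windows with repeated np.hstack and then letting np.unique count whole rows by viewing each row as a single structured record. A stripped-down sketch of just that trick, assuming only NumPy (the function name count_byte_ngrams is illustrative, not from the repo; n == 1 would need the special case used in the original):

import numpy as np

def count_byte_ngrams(arr, n):
    # Stack n shifted copies of arr so that row i holds arr[i:i+n].
    arr_1 = arr.reshape((-1, 1))
    for _ in range(n-1):
        arr_1 = np.hstack((arr_1[:-1], arr_1[1:, -1:]))
    # Viewing the flat bytes as n-field records makes np.unique count rows.
    u, c = np.unique(arr_1.reshape((-1, )).view(','.join(['u1']*n)), return_counts=True)
    return {tuple(t): int(j) for t, j in zip(u, c)}

arr = np.frombuffer(b'abababc', dtype=np.uint8)
print(count_byte_ngrams(arr, 2))  # counts: (97,98) -> 3, (98,97) -> 2, (98,99) -> 1

This avoids a Python-level loop over every position; the original additionally walks arr in used_len chunks so the n stacked copies stay memory-friendly on the full enwik8 array.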