Skip to content

Commit

Permalink
Latest changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Grigory Zhigalov committed Feb 21, 2019
1 parent dcf5aa3 commit 7608c2c
Show file tree
Hide file tree
Showing 9 changed files with 923 additions and 373 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ data/
*.gexf
*.html
*.csv
tensorboard/
cc_env/
runs/
models/
tmp/
tmp_*/
293 changes: 101 additions & 192 deletions 2.3_opencorpora_doc2vec.ipynb

Large diffs are not rendered by default.

122 changes: 1 addition & 121 deletions 3.2_civil_articles_classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -91,126 +91,6 @@
"print(f\"Number of unique articles in the dataset = {mlb.classes_.shape[0]}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['1', '1.2', '1.3', '1.4', '1.5', '1.7', '10', '10.6', '100',\n",
" '1005', '101', '101.4', '102', '103', '103.', '104', '104.1',\n",
" '1041', '105', '1050', '106', '1064', '1065', '1068', '1069',\n",
" '107', '1070', '1071', '1072', '1073', '1074', '1079', '108',\n",
" '1080', '1081', '1082', '1083', '1084', '1085', '1086', '1088',\n",
" '1089', '109', '109.1', '1091', '1094', '1095', '1096', '1098',\n",
" '1099', '11', '11.1', '11.10', '11.2', '11.3', '11.5', '11.8',\n",
" '11.9', '110', '1100', '1101', '1102', '1103', '1104', '1105',\n",
" '1107', '1109', '111', '1110', '1111', '1112', '1113', '1114',\n",
" '1115', '1116', '1117', '1118', '1119', '112', '1120', '1124',\n",
" '1125', '1126', '1127', '1128', '1129', '113', '1130', '1131',\n",
" '1132', '114', '1141', '1142', '1143', '1144', '1146', '1148',\n",
" '1149', '115', '1150', '1151', '1152', '1153', '1154', '1155',\n",
" '1156', '1157', '1158', '1159', '116', '1161', '1162', '1165',\n",
" '1168', '1169', '117', '1170', '1174', '1175', '1177', '118',\n",
" '1181', '119', '12', '12.', '12.1', '12.12', '12.13', '12.14',\n",
" '12.15', '12.16', '12.19', '12.24', '12.26', '12.27', '12.28',\n",
" '12.29', '12.3', '12.37', '12.7', '12.8', '12.9', '120', '121',\n",
" '121.5', '122', '123', '123.17', '123.2', '123.20', '123.22',\n",
" '124', '125', '126', '127', '128', '129', '13', '13.1', '130',\n",
" '131', '132', '133', '134', '135', '136', '137', '138', '139',\n",
" '14', '14.1', '14.15', '14.16', '14.5', '14.6', '14.8', '140',\n",
" '142', '143', '143.1', '144', '145', '146', '147', '1473', '148',\n",
" '15', '15.1', '150', '151', '152', '153', '154', '154.', '155',\n",
" '156', '157', '158', '159', '159.2', '16', '16.1', '16.18',\n",
" '16.24', '160', '161', '161.1', '162', '163', '164', '165',\n",
" '165.1', '166', '167', '167.', '168', '169', '17', '17.1', '17.14',\n",
" '17.15', '170', '171', '173', '173.1', '174', '174.1', '175',\n",
" '176', '177', '178', '179', '18', '18.', '18.8', '180', '181',\n",
" '181.1', '181.2', '181.3', '181.4', '181.5', '182', '184', '185',\n",
" '185.1', '186', '187', '188', '189', '19', '19.15', '19.16',\n",
" '19.22', '19.24', '19.3', '19.5', '190', '191', '192', '193',\n",
" '194', '195', '196', '1966', '197', '198', '199', '2', '2.1',\n",
" '2.2', '2.9', '20', '20.', '20.1', '20.11', '20.12', '20.20',\n",
" '20.21', '20.25', '20.8', '200', '201', '201.1', '201.4', '201.8',\n",
" '202', '203', '204', '205', '206', '207', '208', '209', '21',\n",
" '21.1', '210', '211', '212', '213', '213.11', '214', '215', '216',\n",
" '217', '217.1', '218', '219', '22', '220', '221', '222', '223',\n",
" '224', '225', '225.1', '226', '227', '227.1', '228', '228.1',\n",
" '229', '23', '23.1', '23.2', '23.3', '230', '231', '232', '232.4',\n",
" '233', '234', '234.', '235', '236', '237', '238', '239', '24',\n",
" '24.5', '240', '241', '242', '243', '244', '244.1', '244.2',\n",
" '244.6', '245', '246', '247', '248', '249', '25', '25.1', '25.10',\n",
" '25.2', '25.3', '25.5', '25.7', '250', '251', '252', '253', '254',\n",
" '255', '256', '257', '258', '259', '26', '26.1', '26.3', '260',\n",
" '261', '261.1', '261.4', '261.5', '261.7', '261.8', '262', '263',\n",
" '264', '265', '266', '267', '268', '269', '27', '27.1', '27.13',\n",
" '270', '271', '272', '273', '274', '277', '278', '28', '28.',\n",
" '28.1', '28.2', '28.5', '282', '284', '285', '286', '287', '288',\n",
" '289', '29', '29.1', '290', '291', '291.2', '292', '293', '294',\n",
" '296', '3', '3.', '3.1', '3.3', '30', '30.1', '30.3', '30.7',\n",
" '301', '302', '303', '304', '305', '306', '307', '308', '309',\n",
" '31', '310', '312', '314', '314.1', '315', '316', '317', '317.1',\n",
" '318', '319', '32', '32.', '32.6', '321', '322', '322.1', '322.2',\n",
" '323', '324', '325', '327', '327.1', '329', '33', '33.19', '330',\n",
" '330.40', '331', '332', '333', '333.', '333.16', '333.17',\n",
" '333.18', '333.19', '333.20', '333.22', '333.33', '333.35',\n",
" '333.36', '333.369', '333.40', '333.41', '3333', '334', '334.1',\n",
" '335', '336', '337', '338', '339', '339.1', '339.19', '34', '34.',\n",
" '340', '341', '342', '343', '345', '346', '346.30', '347', '348',\n",
" '349', '35', '350', '350.1', '351', '351.1', '352', '353', '355',\n",
" '356', '357', '358', '359', '36', '36.21', '36.5', '360', '361',\n",
" '362', '363', '364', '365', '367', '37', '372', '374', '38',\n",
" '38.1', '380', '381', '382', '3829', '383', '384', '385', '387',\n",
" '388', '389', '389.1', '39', '39.1', '39.11', '39.14', '39.15',\n",
" '39.16', '39.17', '39.2', '39.20', '39.3', '39.5', '39.7', '39.9',\n",
" '390', '391', '392', '393', '394', '395', '396', '397', '398',\n",
" '399', '4', '4.1', '4.2', '4.3', '4.5', '4.6', '4.7', '40', '400',\n",
" '401', '402', '403', '404', '405', '406', '407', '408', '409',\n",
" '41', '410', '411', '415', '418', '419', '42', '420', '421', '422',\n",
" '423', '424', '425', '426', '427', '428', '429', '43', '430',\n",
" '431', '432', '433', '434', '435', '436', '437', '438', '44',\n",
" '44.1', '440', '441', '442', '443', '445', '446', '447', '448',\n",
" '449', '449.1', '45', '450', '451', '452', '453', '454', '455',\n",
" '456', '457', '458', '46', '464', '469', '47', '470', '475', '476',\n",
" '477', '48', '4839900', '484', '485', '486', '487', '488', '489',\n",
" '49', '492', '494', '5', '5.27', '5.3', '50', '500', '503', '506',\n",
" '51', '513', '516', '52', '523', '53', '53.1', '532', '539', '54',\n",
" '54.1', '540', '541', '544', '545', '546', '547', '548', '549',\n",
" '55', '550', '551', '552', '554', '555', '556', '558', '56', '567',\n",
" '568', '57', '570', '572', '574', '578', '58', '585', '59', '6',\n",
" '6.1', '6.11', '6.12', '6.17', '6.2', '6.3', '6.5', '6.6', '6.9',\n",
" '60', '60.2', '606', '607', '608', '609', '61', '61.1', '61.2',\n",
" '61.8', '610', '611', '614', '615', '619', '62', '62.1', '621',\n",
" '622', '63', '64', '642', '65', '651', '66', '67', '67.1', '671',\n",
" '672', '673', '674', '675', '677', '678', '679', '68', '68.',\n",
" '682', '684', '685', '687', '689', '69', '690', '699', '7', '7.1',\n",
" '7.6', '70', '702', '703', '708', '709', '71', '71.', '711', '72',\n",
" '720', '721', '723', '724', '729', '73', '730', '737', '739', '74',\n",
" '740', '746', '75', '76', '76.1', '77', '77.1', '779', '78', '781',\n",
" '782', '783', '786', '79', '8', '8.1', '8.21', '80', '800', '807',\n",
" '808', '809', '81', '810', '811', '811.', '812', '813', '814',\n",
" '818', '819', '82', '820', '821', '822', '823', '83', '834', '84',\n",
" '84.1', '843', '844', '845', '846', '847', '849', '85', '850',\n",
" '851', '854', '855', '857', '859', '86', '865', '87', '88', '886',\n",
" '889', '89', '896', '9', '9.', '9.1', '9.13', '9.2', '9.4', '90',\n",
" '906', '91', '92', '927', '929', '93', '930', '931', '934', '935',\n",
" '936', '94', '940', '942', '943', '944', '945', '947', '948', '95',\n",
" '951', '954', '957', '958', '96', '961', '963', '964', '965',\n",
" '966', '968', '971', '972', '974', '98', '98.1', '99'],\n",
" dtype=object)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mlb.classes_"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -523,7 +403,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.5.2"
}
},
"nbformat": 4,
Expand Down
120 changes: 120 additions & 0 deletions bert_experiments.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from bert_serving.client import BertClient"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Client for the bert-serving server expected on localhost.\n",
"bc = BertClient(check_length=False, ip='localhost')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_corpus(path, source):\n",
"    \"\"\"Read a gzip-compressed CSV and label every row with `source`.\"\"\"\n",
"    return pd.read_csv(path, compression='gzip').assign(source=source)\n",
"\n",
"\n",
"def article_names(code):\n",
"    \"\"\"Return one display label per row, built from `article_number` and `article_name`.\"\"\"\n",
"    # A plain zip over the two columns replaces the row-wise DataFrame.apply(axis=1).\n",
"    return [\n",
"        \"Ст.{} {}\".format(number, title)\n",
"        for number, title in zip(code['article_number'], code['article_name'])\n",
"    ]\n",
"\n",
"\n",
"# One frame per corpus, each tagged with the corpus it came from.\n",
"oc = load_corpus('tmp/opencorpora.csv.gz', 'opencorpora')\n",
"criminal_code = load_corpus('tmp/vectors/criminal_code.csv.gz', 'criminal_code')\n",
"criminal_court_orders = load_corpus('tmp/vectors/criminal_court_orders.csv.gz', 'criminal_court_orders')\n",
"civil_code = load_corpus('tmp/vectors/civil_code.csv.gz', 'civil_code')\n",
"civil_court_orders = load_corpus('tmp/vectors/civil_court_orders.csv.gz', 'civil_court_orders')\n",
"\n",
"# The two legal codes get a display name made of article number and article title.\n",
"criminal_code['name'] = article_names(criminal_code)\n",
"civil_code['name'] = article_names(civil_code)\n",
"\n",
"# Civil court orders are labelled by their row position.\n",
"civil_court_orders['name'] = [\"Гражд. дело {}\".format(i) for i in range(len(civil_court_orders))]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Bring every corpus to the common (index, name, text, source) layout and stack them\n",
"# under a fresh 0..n-1 index.\n",
"tmp = pd.concat(\n",
"    [\n",
"        oc.reset_index()[['index', 'title', 'text', 'source']].rename(columns={'title': 'name'}),\n",
"        criminal_code.reset_index()[['index', 'name', 'article', 'source']].rename(columns={'article': 'text'}),\n",
"        criminal_court_orders.reset_index()[['index', 'title', 'case', 'source']].rename(columns={'title': 'name', 'case': 'text'}),\n",
"        civil_code.reset_index()[['index', 'name', 'article', 'source']].rename(columns={'article': 'text'}),\n",
"        civil_court_orders.reset_index()[['index', 'name', 'text', 'source']],\n",
"    ],\n",
"    ignore_index=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.13 s, sys: 3.28 s, total: 7.41 s\n",
"Wall time: 21min 29s\n"
]
}
],
"source": [
"%%time\n",
"# Encode the text of every row with a single encode() call on the BERT client.\n",
"bert_vec = bc.encode(list(tmp['text']))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Store each row's encoding as a plain Python list in a 'vectors' column.\n",
"tmp = tmp.assign(vectors=bert_vec.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Write the combined table, including the 'vectors' column, as a gzip-compressed CSV without the index.\n",
"tmp.to_csv('tmp/vectors/bert_all.csv.gz', index=False, compression='gzip')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.6 (testing)",
"language": "python",
"name": "testing"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 7608c2c

Please sign in to comment.