-
Notifications
You must be signed in to change notification settings - Fork 1
/
metadata.py
33 lines (24 loc) · 1.08 KB
/
metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
with open("E:/JURIMIND/v0.0.1/ipc_reduced.json", 'r', encoding='utf-8') as json_file:
data = json.load(json_file)
section_desc = [entry['section_desc'] for entry in data]
section_title = [entry['section_title'] for entry in data]
chapter_title = [entry['chapter_title'] for entry in data]
section_no = [str(entry['Section']) for entry in data]
chapter_no = [str(entry['chapter']) for entry in data]
tfidf_vectorizer = TfidfVectorizer()
section_desc_vectors = tfidf_vectorizer.fit_transform(section_desc)
section_title_vectors = tfidf_vectorizer.fit_transform(section_title)
chapter_title_vectors = tfidf_vectorizer.fit_transform(chapter_title)
metadata_df = pd.DataFrame({
'Section': section_no,
'chapter': chapter_no,
'Section_Desc_Vector': section_desc_vectors,
'Section_Title_Vector': section_title_vectors,
'Chapter_Title_Vector': chapter_title_vectors
})
metadata_output_file = 'E:/vectorization/ipc_metadata.pkl'
joblib.dump(metadata_df, metadata_output_file)