diff --git a/your-code/main.ipynb b/your-code/main.ipynb
index c68edab..f0595dc 100755
--- a/your-code/main.ipynb
+++ b/your-code/main.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,13 +25,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "corpus = []\n",
-    "\n",
-    "# Write your code here"
+    "# Write your code here\n",
+    "# Read each file as plain text, then trim whitespace, drop the trailing period and lowercase.\n",
+    "for path in docs:\n",
+    "    with open(path, encoding='utf-8') as doc_file:\n",
+    "        text = doc_file.read()\n",
+    "    corpus.append(text.strip().rstrip('.').lower())"
    ]
   },
   {
@@ -43,10 +47,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(corpus)"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -84,13 +98,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "bag_of_words = []\n",
-    "\n",
-    "# Write your code here"
+    "# Write your code here\n",
+    "# Keep unique words in first-seen order; the set makes the membership test O(1).\n",
+    "seen_words = set()\n",
+    "for document in corpus:\n",
+    "    for word in document.split():\n",
+    "        if word not in seen_words:\n",
+    "            seen_words.add(word)\n",
+    "            bag_of_words.append(word)"
    ]
   },
   {
@@ -104,10 +124,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack', 'is', 'cool', 'i', 'love', 'am', 'a', 'student', 'at']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(bag_of_words)"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -118,13 +148,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
     "term_freq = []\n",
-    "\n",
-    "# Write your code here"
+    "# Write your code here\n",
+    "# One row per document and exactly one count per bag_of_words entry,\n",
+    "# so rows stay aligned with the vocabulary even when a word repeats.\n",
+    "for document in corpus:\n",
+    "    words = document.split()\n",
+    "    term_freq.append([words.count(term) for term in bag_of_words])"
    ]
   },
   {
@@ -138,10 +172,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(term_freq)"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -169,7 +213,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.8.8"
   }
  },
  "nbformat": 4,