# cleaningredients.py

# -*- coding: utf-8 -*-
"""CleanIngredients.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/11uoiNOGI9JVesOPz0vrBmbnoRFPwSSue
"""

import numpy as np
import pandas as pd

# Ask the Colab user to upload words-by-frequency.txt (the script opens it by
# relative path further down) and report the size of every file received.
from google.colab import files

uploaded = files.upload()
for fn, contents in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(contents)))
    # Keep the name of the upload seen last.
    filename = fn

# `pip install clean-text` is a notebook shell command, not Python: left as a
# bare statement it is a SyntaxError that prevents the whole file from being
# parsed. Install the package programmatically instead, and only when it is
# not already importable.
import subprocess
import sys

try:
    import cleantext  # noqa: F401  (availability check only)
except ImportError:
    # Invoke pip through the running interpreter so the package lands in the
    # environment this script is executing in.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "clean-text"])

# Sample input: a JSON-style list holding the same scraped ingredient string
# twice. Quantities, units and ingredient names run together without
# separators (e.g. "grill4ears of corn, shucked1/2c."), which is what the rest
# of the script tries to untangle.
text = """
    ["Vegetable oil, for grill4ears of corn, shucked1/2c.(1 stick) unsalted butter, softened to room temperature1jalape\u00f1o, seeded and finely chopped1clove garlic, minced2tbsp.chopped fresh cilantro2tbsp.honey1tbsp.chopped fresh oregano1tsp.finely grated lime zest2tsp.fresh lime juice1/2tsp.kosher salt1/4tsp.fresh ground black pepper","Vegetable oil, for grill4ears of corn, shucked1/2c.(1 stick) unsalted butter, softened to room temperature1jalape\u00f1o, seeded and finely chopped1clove garlic, minced2tbsp.chopped fresh cilantro2tbsp.honey1tbsp.chopped fresh oregano1tsp.finely grated lime zest2tsp.fresh lime juice1/2tsp.kosher salt1/4tsp.fresh ground black pepper"]
    """

# import library
from cleantext import clean

# Normalise the text (repair unicode, transliterate to ASCII, lowercase) and
# KEEP the result. Continuing with the raw string would let the letters-only
# regex further down delete non-ASCII letters such as the "ñ" in "jalapeño"
# instead of transliterating them.
text = clean(
    text=text,
    fix_unicode=True,
    to_ascii=True,
    lower=True,
    no_line_breaks=False,
    no_urls=False,
    no_emails=False,
    no_phone_numbers=False,
    no_numbers=False,
    no_digits=False,
    no_currency_symbols=False,
    no_punct=False,
    replace_with_punct="",
    replace_with_url="This is a URL",
    replace_with_email="Email",
    replace_with_phone_number="",
    replace_with_number="123",
    replace_with_digit="0",
    replace_with_currency_symbol="$",
    lang="en",
)
print(text)

# Drop the spaces: word boundaries are re-inferred from scratch later on.
# Quotes, commas and brackets need no separate handling here because the
# letters-only regex applied further down removes every non-letter.
text = text.replace(" ", "")
print(text)

from math import log

# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
# A word's rank is its position in the file, so this assumes the file lists
# words from most to least frequent (TODO confirm against the data file).
# The `with` block closes the file handle as soon as the words are read.
with open("words-by-frequency.txt") as word_file:
    words = word_file.read().split()

# Loop-invariant factor of the cost formula, computed once.
log_vocab_size = log(len(words))
wordcost = {
    word: log((rank + 1) * log_vocab_size) for rank, word in enumerate(words)
}
# Longest dictionary word: upper bound on the candidate word length tried by
# infer_spaces().
maxword = max(len(word) for word in words)

def infer_spaces(s):
    """Split a string that contains no spaces into its most likely words.

    A dynamic program over the prefixes of ``s`` picks, for each prefix, the
    cheapest final word according to the module-level ``wordcost`` table.
    Candidate words are at most ``maxword`` characters long, and substrings
    missing from ``wordcost`` are given an infinite cost.

    Returns the chosen words joined by single spaces.
    """
    unknown = float("inf")

    def best_match(end):
        # Cheapest way to finish the prefix s[:end]: try every allowed length
        # for its last word on top of the already-known cost of the text that
        # precedes it. Gives a (total_cost, last_word_length) pair; equal
        # costs fall to the shorter word because tuples compare element-wise.
        window = cost[max(0, end - maxword):end]
        options = []
        for back, prior in enumerate(reversed(window)):
            length = back + 1
            word_price = wordcost.get(s[end - length:end], unknown)
            options.append((prior + word_price, length))
        return min(options)

    # Forward pass: cost[n] is the minimal cost of segmenting s[:n].
    cost = [0]
    for end in range(1, len(s) + 1):
        cost.append(best_match(end)[0])

    # Backward pass: walk from the end of the string, peeling off the last
    # word of the optimal segmentation each time.
    pieces = []
    end = len(s)
    while end > 0:
        total, length = best_match(end)
        assert total == cost[end]
        pieces.append(s[end - length:end])
        end -= length

    pieces.reverse()
    return " ".join(pieces)

# Show the loaded vocabulary and the text as it stands before segmentation.
print(words)
print(text)

import re

# Matches every character that is not an ASCII letter,
# e.g. regex.sub('', 'ab3d*E') gives 'abdE'.
regex = re.compile('[^a-zA-Z]')

# Keep letters only, lowercase them, then let infer_spaces() put the word
# boundaries back.
text = infer_spaces(regex.sub('', text).lower())
print(text)

# Glue common two-word ingredient names together with an underscore so each
# stays a single token when the text is later split on spaces.
for spaced, joined in (
    (" sauce", "_sauce"),
    (" sugar", "_sugar"),
    (" oil", "_oil"),
    (" juice", "_juice"),
):
    text = text.replace(spaced, joined)
print(text)

# Ask the Colab user to upload food.txt (the script opens it by relative path
# right after) and report the size of every file received.
from google.colab import files

uploaded = files.upload()
for fn, contents in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(contents)))
    # Keep the name of the upload seen last.
    filename = fn

# Vocabulary of edible items: every whitespace-separated token of food.txt.
# The `with` block closes the file handle as soon as the tokens are read.
with open("food.txt") as food_file:
    food_words = food_file.read().split()

print(food_words)

import spacy

# English spaCy pipeline, loaded once and used for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")

# Ingredient names found in the text (filled in further down).
set_ingredients = set()

def classify_edible(item):
    """Label ``item`` as food or not by looking it up in ``food_words``.

    ``item`` is lowercased once and compared for exact equality with the
    entries of the module-level ``food_words`` list (the entries themselves
    are used as-is).

    Returns the string "Edible" on a match, "Not Edible" otherwise.
    """
    # TODO: redefine "equal" as "similar" so near-matches also count.
    if item.lower() in food_words:
        return "Edible"

    return "Not Edible"
def is_noun(string):
    """Return True if spaCy tags at least one token of ``string`` as a noun.

    Reuses the module-level ``nlp`` pipeline rather than calling
    ``spacy.load`` on every invocation: this function runs inside the
    per-word loop of the script, and reloading the model for each word
    repeats the same expensive setup without changing the result.
    """
    return any(token.pos_ == "NOUN" for token in nlp(string))
# Split the segmented text into words and keep those that are both listed in
# food_words and tagged as a noun by spaCy.
items = text.split(' ')

for item in items:
    # The dictionary lookup is the cheap test, so it goes first and
    # short-circuits the spaCy call for words that are not food.
    if classify_edible(item) == "Edible" and is_noun(item):
        set_ingredients.add(item)

print("Ingredients are:", set_ingredients)

# Measurement units and counters are not ingredients; entries that are absent
# from the set are simply ignored.
set_ingredients.difference_update(
    ("teaspoon", "teaspoons", "tablespoon", "tablespoons", "pieces", "piece")
)

print("Ingredients are:", set_ingredients)

# Comma-separated result; join() leaves no trailing separator to strip and
# yields an empty string for an empty set.
final_str = ",".join(set_ingredients)
print(final_str)