Python

Read and write CSV with encoding

import pandas as pd
data_set = pd.read_csv("train.csv") # default encoding is utf-8
data_set = pd.read_csv("train.csv", encoding='utf-8')
data_set.to_csv("train-utf8.csv")

Look at first 5 rows

# head(5) returns the first five rows in file order; sample(5) would
# instead return five rows picked at random.
data_set.head(5)

Find missing data

# null value Series object
missing_data_summary = data_set.isnull().sum()
# The total number of missing data cells
missing_data_summary.sum()

Data set attributes

data_set.shape #Find the size - tuple
data_set.columns #Find all columns
data_set.Sex #Find all values of 'Sex' column

Product of a tuple

import numpy as np
tup = (3, 5)
# np.prod multiplies all elements together. The old np.product alias was
# deprecated in NumPy 1.25 and removed in NumPy 2.0.
np.prod(tup)  # 15

Remove all columns with at least one missing value

columns_with_na_dropped = data_set.dropna(axis=1)

Replace each NA with the value that comes directly after it in the same column, then replace all the remaining NAs with 0

# Back-fill each NA from the next valid value further down the same column,
# then set whatever is still NA (trailing gaps) to 0.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
data_set.bfill(axis=0).fillna(0)

Add a new column to data set

data_set['NewColomn'] = data_set['Age'].isnull()

For in if syntax

a=[1,2,3,4,5]
b = [v for v in a if v%2 ==0] # [2,4]
# Get the indices of the elements that are equal to 1
print ([i for i, e in enumerate([1, 2, 1]) if e == 1]) # [0, 2]

Find accuracy of the prediction

from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

Parse date

date = '20/08/1981'
date_parsed = pd.to_datetime(date, format = "%d/%m/%Y")
day = date_parsed.day
month = date_parsed.month
year = date_parsed.year

Encode and decode

before = '你好'
after = before.encode("utf8", errors = "replace")
print(after.decode("utf8"))

Read raw data and detect character encoding

import chardet

with open("ming.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

Data set unique, sort, lower case

unique_names = data_set['Name'].unique()
# ndarray.sort() sorts in place and returns None, so assigning its result
# would leave sorted_names as None. sorted() returns a new sorted list and
# leaves unique_names untouched.
sorted_names = sorted(unique_names)
lower_name_column = data_set['Name'].str.lower()
trimmed_name_column = data_set['Name'].str.strip()

Find similar words

import fuzzywuzzy
from fuzzywuzzy import process

trimmed_name_column = data_set['Name'].str.strip()
matches = fuzzywuzzy.process.extract("Braund, Mr. Owen Harris", trimmed_name_column, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

Print mixed objects - convert object to string

l = ['a','b']
print ('list is'+str(l))

Data set Group by and mean

# Mean of every numeric column per 'Sex' group. numeric_only=True skips text
# columns, which would otherwise make mean() raise on pandas 2.x.
sex_survive_rate = data_set.groupby('Sex').mean(numeric_only=True)

Select rows from data set

# Select the rows whose 'Name' column contains 'Master'.
# Indexing with the boolean mask keeps only the matching rows.
# (The original referenced an undefined `train`; this file's frame is `data_set`.)
boys = data_set[data_set['Name'].str.contains('Master')]
print(type(boys)) # DataFrame

Split a string separated by comma

# Do not name the result `str`: that rebinds the builtin and breaks every
# later str(...) call in the same module.
parts = "abc,def".split(',') # ['abc', 'def']

Convert a True/False series to 1/0 Series

from pandas import Series
series = Series([True, False, True])
series.astype(int) #[1,0,1]

Data frame custom index

# DataFrame is reached through the `pd` alias: the bare name DataFrame is
# never imported in this file (only Series is).
# Without an explicit index, rows are labelled 0..n-1.
normal_index_data_frame = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
print(list(normal_index_data_frame.index)) # [0, 1]
# index= supplies custom row labels, one per row.
custom_index_data_frame = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}, index=['a','b'])
print(list(custom_index_data_frame.index)) # ['a', 'b']

Drop a column from data set

# axis=0 drops rows, axis=1 drops columns.
# Use ONE of the two forms below: running both raises KeyError on the second
# call because 'Cabin' has already been removed.
data_set = data_set.drop('Cabin', axis=1)  # returns a new DataFrame
# data_set.drop('Cabin', axis=1, inplace=True)  # equivalent in-place alternative

Iterate through rows of data set

# iterrows() yields (index label, row as a Series) pairs.
for index, row in data_set.iterrows():
    # .iloc makes the access explicitly positional; plain row[0] on a
    # label-indexed Series is deprecated in recent pandas.
    print (str(index)+' '+str(row.iloc[0])+ ' '+str(row.iloc[1]))

Slice/substring: a[start : end : step] # start through not past end, by step

# a[start:end:step] - start is inclusive, end is exclusive; a negative step
# walks backwards and negative indices count from the end.
a='abcdefg'
print (a[0:4]) #abcd
print (a[0:-1]) #abcdef
print (a[:-1]) #abcdef
print (a[0:4:1]) #abcd
print (a[0:4:2]) #ac
print (a[::1]) #abcdefg
print (a[::-1]) #gfedcba
print (a[4:0:-1]) #edcb
print (a[-2::-1]) #fedcba
print (a[:-3:-1]) #gf

Lambda function

def test_lambda(n):
    # Returns a one-argument function that adds the captured n to its input.
    return lambda x: x+n

func = test_lambda(5)  # func behaves like: lambda x: x + 5

func(4) # returns 9 (nothing is printed unless wrapped in print())

Data frame pivot table

df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                     "bar", "bar", "bar", "bar"],
                  "B": ["one", "one", "one", "two", "two",
                     "one", "one", "two", "two"],
                  "C": ["small", "large", "large", "small",
                     "small", "large", "small", "small",
                     "large"],
                  "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
# Sum column D for every (A, B) combination. Passing the string "sum" selects
# the built-in pandas aggregation; passing np.sum triggers a FutureWarning on
# recent pandas versions.
df.pivot_table(values='D', index=['A', 'B'], aggfunc="sum")

          D
A   B      
bar one   9
    two  13
foo one   5
    two   6

# Same sums, but with the values of B spread across columns instead of being
# a second index level. "sum" avoids the FutureWarning raised for np.sum.
df.pivot_table(values='D', index=['A'], columns=['B'], aggfunc="sum")

B    one  two
A            
bar    9   13
foo    5    6

DataFrame set index (Move a column to index)

df = pd.DataFrame({'month': [1, 4, 7, 10],
                   'year': [2012, 2014, 2013, 2014],
                   'sale':[55, 40, 84, 31]})

df.set_index('year')

          month  sale
year             
2012      1    55
2014      4    40
2013      7    84
2014     10    31

Print new line and escape new line

print('Ru\noob') 
Ru
oob
print(r'Ru\noob')
Ru\noob

Double * and Double /

print (2 ** 4) # exponentiation: 2^4 = 16
print (3 / 2) # true division: 1.5
print (3 // 2) # floor division: 1 (rounds down towards negative infinity, not simple truncation)

Define 1 element tuple

tup = (1)
print (type(tup)) #int, not a tuple
tup = (1,)
print (type(tup)) #tuple

Set

student = {'Tom', 'Jim', 'Mary', 'Tom', 'Jack', 'Rose'}
print(student)  # Remove duplicates

if ('Rose' in student):
    print('Rose in set')
else:
    print('Rose not in set')

a = set('abracadabra')
b = set('alacazam')

print(a)
print(a - b)  # a minus b, {'b', 'd', 'r'}
print(a | b)  # a or b, {'a', 'z', 'r', 'd', 'c', 'l', 'm', 'b'}
print(a & b)  # a and b, {'a', 'c'}
print(a ^ b)  # either a or b, but not in (a and b), {'z', 'b', 'r', 'd', 'm', 'l'}

Create an executable python file, called run.py

The content of run.py

#! /usr/bin/env python3
print ("Hello, Python!")

Command line

chmod +x run.py
./run.py

is and id() which returns address of memory

a = 'abcdefg'
b = 'abcdefg'

print(id(a))
print(id(b))
# NOTE(review): both results below are True presumably because the interpreter
# reuses one object for the two identical string literals - confirm; equal
# strings are not guaranteed to be the same object, so compare values with ==.
print (id(a) == id(b)) # True
print (a is b) # True

Format a string

# %s formats the first tuple item as a string, %d formats the second as an integer.
print("My name is %s. I'm %d years old! " % ('Mingtao', 10))

Multiple line string and multiple line comments

'''
abc = 'This is comment'
'''

multiple_line_string = '''
line 1
line 2
line 3
'''

print(multiple_line_string)

Iterator/Generator, yield

def fibonacci(n):
    """Yield the Fibonacci numbers F(0) through F(n), i.e. n + 1 values.

    Args:
        n: Index of the last Fibonacci number to produce. A negative n
            yields nothing.

    Yields:
        Successive Fibonacci numbers starting at 0.
    """
    a, b, counter = 0, 1, 0
    while True:
        if counter > n:
            return  # ends the generator; next() then raises StopIteration
        yield a  # the yield statement is what makes this function a generator
        a, b = b, a + b
        counter += 1

import sys  # needed for sys.stdout.flush() and sys.exit() below

f = fibonacci(10)  # f is a generator object, which is an iterator

# Pull values one at a time with next() until the generator is exhausted.
while True:
    try:
        print(next(f), end=' ')
        sys.stdout.flush()  # show each number immediately instead of buffering
    except StopIteration:
        # The generator has finished; terminate the script.
        sys.exit()

Output is 0 1 1 2 3 5 8 13 21 34 55

*args (variable positional arguments) and **kwargs (variable keyword arguments)

def test_vartuple(arg1, *vartuple):
    # arg1 takes the first positional argument; any further positional
    # arguments are collected into the tuple `vartuple` (empty if none given).
    print("formal arg:", arg1)
    print("vartuple:", vartuple)

test_vartuple(10)
test_vartuple(70, 60, 50)
Output:
formal arg: 10
vartuple: ()
formal arg: 70
vartuple: (60, 50) 
    
def test_vardict(farg, **vardict):
    # farg takes the first argument; any extra keyword arguments are collected
    # into the dict `vardict`, keyed by argument name.
    print ("formal arg:", farg)
    print ("vardict:", vardict)

test_vardict('a', a=2, b=3)
Output:
formal arg: a
vardict: {'a': 2, 'b': 3}

Argument after *vargs must be keyword argument

def arg_after_vargs(a, b, *varg, c):
    # c is declared after *varg, so it can only be passed by keyword;
    # extra positional arguments all land in varg.
    print(varg)
    return a+b+c

arg_after_vargs(1,2,3) # Wrong: 3 is captured by *varg, c is missing -> TypeError
arg_after_vargs(1,2,c=3) # Correct: prints () and returns 6

Global and nonlocal

num = 1
def fun1():  
    num = 123
    print('local', num)
fun1()
print('global', num)

Output:
local 123
global 1

num = 1
def fun1():  
    global num
    num = 123
    print('local', num)
fun1()
print('global', num)

Output:
local 123
global 123

def outer():
    num = 10
    def inner():
        # nonlocal binds `num` to the variable in the enclosing function
        # instead of creating a new local, so the assignment changes outer's num.
        nonlocal num
        num = 100
        print(num)
    inner()
    print(num)  # prints 100: inner() modified outer's variable
outer()

Output:
100
100

Add an 'index' column

data_set = data_set.reset_index()

Series.where() to replace

a = Series(range(1,5))
print(a.where(a<3, 100)) #Replace the items that don't satisfy the condition

Value count (Series)

from pandas import Series
a = Series(['a','a','b','b','b','c'])
a.value_counts()

Output:
b    3
a    2
c    1

NP array shape and reshape

a = np.array([1,2,3,4])
a.shape # (4,)
b = a.reshape(-1, 1) # Convert to N rows, 1 column
b.shape # (4, 1)
c = a.reshape(1, -1) # Convert to 1 row, N columns
c.shape # (1, 4)
d = a.reshape(2, 2) # Convert to 2 rows, 2 columns
d.shape # (2, 2)

Convert between DataFrame and ndarray

import pandas as pd

d = {'name': ['Braund', 'Cummings', 'Heikkinen', 'Allen'],
     'age': [22,38,26,35],
     'fare': [7.25, 71.83, 0 , 8.05],
     'survived': [False, True, True, False]}

df = pd.DataFrame(d)
columns = df.columns
arr = df.values
df2 = pd.DataFrame(arr, columns=columns)

nparray concat

a = np.array([1, 2, 3]).reshape(-1,1)
b = np.array([4, 5, 6]).reshape(-1,1)
np.c_[a,b] # concatenate from left to right
Output:
array([[1, 4],
      [2, 5],
      [3, 6]])
np.r_[a,b] # concatenate from top to bottom
Output:
array([[1],
   [2],
   [3],
   [4],
   [5],
   [6]])

zip, aggregate elements from each iterables

a,b = '123','abc'
for i, j in zip(a,b):
    print (i,j)
Output:
1 a
2 b
3 c

Create an empty 2D array

# Pure Python: 10 rows x 4 columns of zeros. The nested comprehension builds a
# separate inner list for every row. (The original used range(11), which
# disagreed with the 10-row NumPy version below.)
zeros_2d = [[0 for col in range(4)] for row in range(10)]

# use numpy
import numpy as np
zeros_2d_np = np.zeros((10,4)).astype(int).tolist()

Plot y as a noisy quadratic function of X

import matplotlib.pyplot as plt
m = 100  # number of sample points (was undefined in the original snippet)
X = 6 * np.random.rand(m, 1) - 3  # m values drawn uniformly from [-3, 3)
# Quadratic in X plus standard-normal noise
y = 0.5 * X ** 2 + X + 2 + np.random.randn(m, 1)
plt.plot(X, y, 'b.') # b means blue, . means dot, not line
plt.show()

Set plot x, y axis limits

import matplotlib.pyplot as plt
plt.axis([0,90,0,3]) # X between 0 and 90, Y between 0 and 3

Reduce, Filter, Map and lambda

def add(x, y) :            # Add two numbers
    return x + y

from functools import reduce
reduce(add, [1,2,3,4,5])   # Sum of the list: 1+2+3+4+5
Output: 15
reduce(lambda x, y: x+y, [1,2,3,4,5])  # Same sum using an anonymous lambda function
Output: 15

list(map(lambda x: x ** 2, [1,2,3,4]))
Output: [1, 4, 9, 16]
list(filter(lambda x: x>1, [1,2,3,4]))
Output: [2, 3, 4]

Tensorflow print out trainable_variables

variables_names = [v.name for v in tf.trainable_variables()]
print(variables_names)

Get a value of a tensor within a session

b0 = sess.run(sess.graph.get_tensor_by_name("dnn/outputs/bias:0"))

NP arange and array_split

x = np.arange(7.0)
np.array_split(x, 3)
Output: [array([ 0.,  1.,  2.]), array([ 3.,  4.]), array([ 5.,  6.])]

Add a new dimension to an array

origin = np.zeros((50,50))
print(origin.shape) # 50, 50
expanded = np.expand_dims(origin, -1) # -1 means inserted new axis at the end
print(expanded.shape) # 50, 50, 1

Pandas merge

df1 = pd.DataFrame({'key': ['a', 'b', 'b', 'd'], 'data1': range(4)})
df2 = pd.DataFrame({'key': ['a', 'b', 'c'], 'data2': range(3)})  
pd.merge(df1, df2) # Joins on the shared 'key' column; only keys present in both frames are kept ('c' and 'd' are dropped), and duplicated keys ('b') produce one output row per match.
Output:
    key  data1  data2
0   a      0      0
1   b      1      1
2   b      2      1

* used in function call unpacks tuple

def sample(a, b, c):
    """Print the three arguments on one line, separated by spaces."""
    # print is a function in Python 3; the Python 2 statement form
    # `print a, b, c` is a SyntaxError.
    print(a, b, c)

tp = ("Hello", "Python", "Learner")
sample(*tp)

** used in function call unpacks dictionary

def sample(name="Sample Python Post", type="Post", Date=""):
    """Print name, type and Date on one line, separated by spaces.

    The parameter names are part of the interface: callers pass them by
    keyword, for example by unpacking a dict with **.
    """
    # The body must be indented under the def; unindented it is a SyntaxError.
    print(name, type, Date)

d = {'type': "New Post", 'name': "* and ** in python", 'Date': ""}
sample(**d)

Resize an image and convert to grey scale

# Assuming old image is np array of size (height, width, 3)
from PIL import Image
INPUT_SHAPE = (84, 84)
pil_img = Image.fromarray(input_img)
resize_img = pil_img.resize(INPUT_SHAPE)
grey_img = resize_img.convert('L')
output_img = np.array(grey_img)
assert output_img.shape == (84,84)

Update epsilon

epsilon = 1.
max_epsilon = 1.
min_epsilon = 0.01
decay_rate = 0.005
for episode in range(100):
    # Exponential decay from max_epsilon towards min_epsilon as episodes
    # advance. The result is assigned back to `epsilon`; the original stored
    # it in a separate `new_epsilon`, so epsilon itself never changed.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

Select index value from np array

a = np.array([[1,2,3],[4,5,6],[7,8,9]])
b = a[range(3), [0,1,2]]
print(b)
# Output: [1 5 9]

Time

import time
start_time = time.time()  # timestamp taken before the work to be timed
# ... the code being timed goes here ...
end_time = time.time() - start_time  # despite the name, this is the elapsed time (a difference of two timestamps), not a timestamp

Weighted sampling

aa = np.array([1,2,3,4])
print(np.random.choice(aa, 100, p=aa/sum(aa)))
# Output: array([3, 4, 4, 4, 4, 4, 1, 4, 4, 3, 1, 3, 1, 3, 3, 4, 3, 2, 3, 2, 4, 3,
   4, 3, 4, 4, 1, 3, 4, 2, 2, 4, 3, 3, 4, 1, 4, 2, 2, 3, 4, 2, 3, 3,
   4, 4, 3, 4, 1, 3, 1, 1, 3, 3, 1, 1, 4, 4, 4, 4, 3, 2, 2, 1, 2, 4,
   4, 2, 2, 1, 4, 1, 2, 1, 2, 3, 3, 3, 4, 3, 4, 3, 3, 2, 2, 2, 4, 2,
   4, 3, 2, 3, 3, 1, 4, 3, 1, 3, 4, 1])
# The proportion of 4 to 3 to 2 to 1 is 40%:30%:20%:10%

Quick testing tensorflow

import tensorflow as tf
from keras.losses import mean_squared_error

def my_loss(y_pred, y_true):
    # Sum of squared differences along the last axis (a sum, not a mean).
    # NOTE(review): the parameter order here is (y_pred, y_true) - confirm
    # against the convention of whatever loss this is compared with; the
    # squared difference itself is symmetric in the two arguments.
    x = y_true - y_pred
    return tf.reduce_sum(tf.square(x), axis=-1)


y_pred = tf.Variable([[1.,2.,3.],[1.,2.,3.]])
y_true = tf.Variable([[1.,2.,4.],[1.,2.,3.]])

sess = tf.InteractiveSession()
# Variables must be initialized before they can be read in a session.
# tf.global_variables_initializer() replaces the deprecated
# tf.initialize_all_variables().
sess.run(tf.global_variables_initializer())

loss = mean_squared_error(y_pred, y_true)
loss2 = my_loss(y_pred, y_true)

loss_val, loss2_val = sess.run([loss, loss2])
print (loss_val, loss2_val)

Quick way to reshape numpy array from vector to matrix

arr = np.array([1,2,3,4])
print(arr.shape)
print(arr.reshape(-1, arr.shape[0]).shape) # Bad way
print(arr[None, :].shape) # Good way

Tensorflow Eager execution, Gradient, Optimizer applies gradient

import tensorflow as tf
tf.enable_eager_execution()
x = tf.contrib.eager.Variable([3.0], dtype=tf.float32) # Can't use tf.Variable due to eager execution
y = tf.contrib.eager.Variable([4.0], dtype=tf.float32)
with tf.GradientTape() as tape:
    z = x ** 2 + y ** 2
grads = tape.gradient(z, [x, y])
opt = tf.train.GradientDescentOptimizer(learning_rate=0.01)
opt.apply_gradients(zip(grads, [x, y]))
print(grads, x.numpy(), y.numpy()) # Convert tensor to numpy array for output convenience

# Output
[<tf.Tensor: id=7247, shape=(1,), dtype=float32, numpy=array([6.], dtype=float32)>, <tf.Tensor: id=7228, shape=(1,), dtype=float32, numpy=array([8.], dtype=float32)>] [2.94] [3.92]

Good way to create dictionary

def char_num(word):
    """Return a dict mapping each character of `word` to its occurrence count.

    Keys appear in order of first occurrence. An empty `word` gives {}.
    """
    # Counter tallies in a single pass; calling word.count(ch) for every
    # character rescans the whole word each time (quadratic work).
    from collections import Counter
    return dict(Counter(word))

print(char_num('COMMUNICATION'))
# Output
{'C': 2, 'O': 2, 'M': 2, 'U': 1, 'N': 2, 'I': 2, 'A': 1, 'T': 1}

Permutation and Combination

import itertools
candidate_set = [1,2,3,4]
# combinations: order does not matter, so (1, 2) and (2, 1) count once.
print(list(itertools.combinations(candidate_set, 2)))
# Output
# [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
# permutations: order matters. The function name is plural;
# itertools.permutation does not exist and raises AttributeError.
print(list(itertools.permutations(candidate_set, 2)))
# Output
# [(1, 2), (1, 3), (1, 4), (2, 1), (2, 3), (2, 4), (3, 1), (3, 2), (3, 4), (4, 1), (4, 2), (4, 3)]

Python

Read and write CSV with encoding

Look at first 5 rows

Find missing data

Data set attributes

Product of a tuple

Remove all columns with at least one missing value

Replace all NA's the value that comes directly after it in the same column, then replace all the remaining na's with 0

Add a new column to data set

For in if syntax

Find accuracy of the prediction

Parse date

Encode and decode

Read rawdata and detect charcoding

Data set unique, sort, lower case

Find similar words

Print mix objects - convert object to string

Data set Group by and mean

Select rows from data set

Split a string separated by comma

Convert a True/False series to 1/0 Series

Data frame custom index

Drop a column from data set

Iterate through rows of data set

Slice/substring: a[start : end : step] # start through not past end, by step

Lambda function

Data frame pivot table

DataFrame set index (Move a column to index)

Print new line and escape new line

Double * and Double /

Define 1 element tuple

Set

Create an executable python file, called run.py

The content of run.py

Command line

is and id() which returns address of memory

Format a string

Multiple line string and multiple line comments

Iterator/Generator, yield

*vargs and **vargs

Argument after *vargs must be keyword argument

Global and nonlocal

Add an 'index' column

Series.where() to replace

Value count (Series)

NP array shape and reshape

Convert between DataFrame and ndarray

nparray concat

zip, aggregate elements from each iterables

Create an empty 2D array

Plot y = X function

Set plot x, y axix limit

Reduce, Filter, Map and lambda

Tensorflow print out trainable_variables

Get a value of a tensor within a session

NP arange and array_split

Add a new dimension to an array

Pandas merger

* used in function call unpacks tuple

** used in function call unpacks dictionary

Resize an image and convert to grey scale

Update epsilon

Select index value from np array

Time

Weighted sampling

Quick testing tensorflow

Quick way to reshape numpy array from vector to matrix

Tensorflow Eager execution, Gradient, Optimizer applies gradient

Good way to create dictionary

Permutation and Combination

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!