-
Notifications
You must be signed in to change notification settings - Fork 4
Python
Mingtao edited this page Nov 2, 2018
·
72 revisions
import pandas as pd
data_set = pd.read_csv("train.csv") # default encoding is utf-8
data_set = pd.read_csv("train.csv", encoding='utf-8')
data_set.to_csv("train-utf8.csv")
data_set.sample(5)
# null value Series object
missing_data_summary = data_set.isnull().sum()
# The total number of missing data cells
missing_data_summary.sum()
data_set.shape #Find the size - tuple
data_set.columns #Find all columns
data_set.Sex #Find all values of 'Sex' column
import numpy as np
# Multiply the entries of a shape tuple to get the total number of cells.
tup = (3, 5)
# np.prod is the supported spelling; the np.product alias was removed in NumPy 2.0.
np.prod(tup)  # 15
columns_with_na_dropped = data_set.dropna(axis=1)
Replace each NA with the value that comes directly after it in the same column, then replace any remaining NAs with 0
# Back-fill down each column first, then zero-fill whatever is still missing
# (an NA with no later value in its column has nothing to copy from).
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
data_set.bfill(axis=0).fillna(0)
data_set['NewColomn'] = data_set['Age'].isnull()
a=[1,2,3,4,5]
b = [v for v in a if v%2 ==0] # [2,4]
# Get the index of the list
print ([i for i, e in enumerate([1, 2, 1]) if e == 1])
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)
date = '20/08/1981'
date_parsed = pd.to_datetime(date, format = "%d/%m/%Y")
day = date_parsed.day
month = date_parsed.month
year = date_parsed.year
before = '你好'
after = before.encode("utf8", errors = "replace")
print(after.decode("utf8"))
import chardet
# Guess the file's encoding from its first 10000 bytes. The file is opened in
# binary mode so the raw, undecoded bytes are handed to chardet.
with open("ming.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))
unique_names = data_set['Name'].unique()
# ndarray.sort() sorts in place and returns None, so assigning its result
# would leave sorted_names as None. np.sort returns a sorted copy instead and
# leaves unique_names untouched.
sorted_names = np.sort(unique_names)
lower_name_column = data_set['Name'].str.lower()
trimmed_name_column = data_set['Name'].str.strip()
import fuzzywuzzy
from fuzzywuzzy import process
trimmed_name_column = data_set['Name'].str.strip()
matches = fuzzywuzzy.process.extract("Braund, Mr. Owen Harris", trimmed_name_column, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
l = ['a','b']
print ('list is'+str(l))
# Per-sex mean of every numeric column. numeric_only=True skips text columns
# such as 'Name', which recent pandas versions refuse to average.
sex_survive_rate = data_set.groupby('Sex').mean(numeric_only=True)
# Select the rows whose 'Name' column contains 'Master'.
# The frame used throughout these snippets is data_set (there is no `train`).
# na=False treats a missing name as "no match" so the mask is purely boolean.
boys = data_set[data_set['Name'].str.contains('Master', na=False)]
print(type(boys)) # DataFrame
# Bind the result to a name that does not shadow the built-in str(), which
# later snippets still need to call.
parts = "abc,def".split(',')  # ['abc', 'def']
from pandas import Series
series = Series([True, False, True])
series.astype(int) #[1,0,1]
# Without an explicit index the rows are labelled 0..n-1.
# DataFrame is reached through the pd alias because only Series is imported by name.
normal_index_data_frame = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
print(list(normal_index_data_frame.index)) # [0, 1]
# Passing index= replaces the default row labels.
custom_index_data_frame = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}, index=['a','b'])
print(list(custom_index_data_frame.index)) # ['a', 'b']
# Two ways to remove the 'Cabin' column: rebind the returned copy, or edit in place.
data_set = data_set.drop('Cabin', axis = 1) # axis=0 means drop row, axis=1 means drop column
data_set.drop('Cabin', axis = 1, inplace = True) # Same effect as the first line, but modifies data_set in place
# Walk the frame row by row; iterrows() yields (index label, row Series) pairs.
for index, row in data_set.iterrows():
    # .iloc makes the positional lookup explicit; plain row[0] on a
    # label-indexed Series is deprecated in recent pandas.
    print (str(index)+' '+str(row.iloc[0])+ ' '+str(row.iloc[1]))
# Slice syntax is a[start:stop:step]; stop is exclusive, and a negative step
# walks the string from right to left.
a='abcdefg'
print (a[0:4]) #abcd
print (a[0:-1]) #abcdef
print (a[:-1]) #abcdef
print (a[0:4:1]) #abcd
print (a[0:4:2]) #ac
print (a[::1]) #abcdefg
print (a[::-1]) #gfedcba
print (a[4:0:-1]) #edcb
print (a[-2::-1]) #fedcba
print (a[:-3:-1]) #gf
def test_lambda(n):
return lambda x: x+n
func = test_lambda(5)
func(4) # print 9
df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                         "bar", "bar", "bar", "bar"],
                   "B": ["one", "one", "one", "two", "two",
                         "one", "one", "two", "two"],
                   "C": ["small", "large", "large", "small",
                         "small", "large", "small", "small",
                         "large"],
                   "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
# Sum D for every (A, B) pair. The string 'sum' is the supported way to name
# the aggregation; passing np.sum triggers a FutureWarning in recent pandas.
df.pivot_table(values='D', index=['A', 'B'], aggfunc='sum')
D
A B
bar one 9
two 13
foo one 5
two 6
# Same sums, but with the values of B spread across columns instead of a second index level.
# The string 'sum' replaces np.sum, which triggers a FutureWarning in recent pandas.
df.pivot_table(values='D', index=['A'], columns=['B'], aggfunc='sum')
B one two
A
bar 9 13
foo 5 6
df = pd.DataFrame({'month': [1, 4, 7, 10],
'year': [2012, 2014, 2013, 2014],
'sale':[55, 40, 84, 31]})
df.set_index('year')
month sale
year
2012 1 55
2014 4 40
2013 7 84
2014 10 31
print('Ru\noob')
Ru
oob
print(r'Ru\noob')
Ru\noob
print (2 ** 4) # 2^4 = 16
print (3 / 2) # 1.5
print (3 // 2) # 1, floor division: the quotient is rounded down to a whole number
tup = (1)
print (type(tup)) #int, not a tuple
tup = (1,)
print (type(tup)) #tuple
# A set keeps a single copy of each value, so the repeated 'Tom' collapses.
student = {'Tom', 'Jim', 'Mary', 'Tom', 'Jack', 'Rose'}
print(student) # Remove duplicates
# Membership in a set is tested with the `in` operator.
if 'Rose' in student:
    print('Rose in set')
else:
    print('Rose not in set')
a = set('abracadabra')
b = set('alacazam')
print(a)
print(a - b) # a minus b, {'b', 'd', 'r'}
print(a | b) # a or b, {'a', 'z', 'r', 'd', 'c', 'l', 'm', 'b'}
print(a & b) # a and b, {'a', 'c'}
print(a ^ b) # either a or b, but not in (a and b), {'z', 'b', 'r', 'd', 'm', 'l'}
#! /usr/bin/env python3
print ("Hello, Python!")
chmod +x run.py
./run.py
a = 'abcdefg'
b = 'abcdefg'
print(id(a))
print(id(b))
print (id(a) == id(b)) # True
print (a is b) # True
# %-style formatting: %s is filled with a string, %d with an integer.
print("My name is %s. I'm %d years old! " % ('Mingtao', 10))
'''
abc = 'This is comment'
'''
multiple_line_string = '''
line 1
line 2
line 3
'''
print(multiple_line_string)
def fibonacci(n):
    """Generate the Fibonacci numbers F(0) through F(n), inclusive.

    Because the body contains ``yield``, calling this function returns a
    generator; no code runs until the caller iterates it. A negative ``n``
    yields nothing.
    """
    a, b, counter = 0, 1, 0
    while counter <= n:
        yield a  # hand the current value to the caller and pause here
        a, b = b, a + b
        counter += 1
import sys

f = fibonacci(10) # f is a generator (and therefore an iterator)
while True:
    try:
        print(next(f), end=' ')
        sys.stdout.flush()
    except StopIteration:
        # next() raises StopIteration once the generator is exhausted.
        # Leave the loop instead of calling sys.exit(), which would end the
        # whole interpreter session along with the loop.
        break
Output is 0 1 1 2 3 5 8 13 21 34 55
def test_vartuple(arg1, *vartuple):
print("formal arg:", arg1)
print("vartuple:", vartuple)
test_vartuple(10)
test_vartuple(70, 60, 50)
Output:
formal arg: 10
vartuple: ()
formal arg: 70
vartuple: (60, 50)
def test_vardict(farg, **vardict):
print ("formal arg:", farg)
print ("vardict:", vardict)
test_vardict('a', a=2, b=3)
Output:
formal arg: a
vardict: {'a': 2, 'b': 3}
def arg_after_vargs(a, b, *varg, c):
    """Print the extra positional arguments and return ``a + b + c``.

    ``c`` is declared after ``*varg``, which makes it keyword-only: it can
    never be filled positionally.
    """
    print(varg)
    return a+b+c
# arg_after_vargs(1,2,3) # Wrong: 3 is collected by *varg, so c is missing -> TypeError
arg_after_vargs(1,2,c=3) # Correct
num = 1
def fun1():
    """Assign to a local ``num``; the module-level ``num`` is left untouched."""
    num = 123  # creates a new local name that hides the global one
    print('local', num)
fun1()
print('global', num)
Output:
local 123
global 1
num = 1
def fun1():
    """Rebind the module-level ``num`` to 123 and print it."""
    global num  # make the assignment below target the module-level name
    num = 123
    print('local', num)
fun1()
print('global', num)
Output:
local 123
global 123
def outer():
    """Show that ``nonlocal`` lets a nested function rebind the enclosing variable."""
    num = 10
    def inner():
        nonlocal num  # refer to outer()'s num rather than creating a new local
        num = 100
        print(num)
    inner()
    print(num)  # already 100: inner() changed outer()'s variable
outer()
Output:
100
100
data_set = data_set.reset_index()
a = Series(range(1,5))
print(a.where(a<3, 100)) #Replace the items that don't satisfy the condition
from pandas import Series
a = Series(['a','a','b','b','b','c'])
a.value_counts()
Output:
b 3
a 2
c 1
a = np.array([1,2,3,4])
a.shape # (4,)
b = a.reshape(-1, 1) # Convert to N rows, 1 column
b.shape # (4, 1)
c = a.reshape(1, -1) # Convert to 1 row, N columns
c.shape # (1, 4)
d = a.reshape(2, 2) # Convert to 2 rows, 2 columns
d.shape # (2, 2)
import pandas as pd
d = {'name': ['Braund', 'Cummings', 'Heikkinen', 'Allen'],
'age': [22,38,26,35],
'fare': [7.25, 71.83, 0 , 8.05],
'survived': [False, True, True, False]}
df = pd.DataFrame(d)
columns = df.columns
arr = df.values
df2 = pd.DataFrame(arr, columns=columns)
a = np.array([1, 2, 3]).reshape(-1,1)
b = np.array([4, 5, 6]).reshape(-1,1)
np.c_[a,b] # concatenate from left to right
Output:
array([[1, 4],
[2, 5],
[3, 6]])
np.r_[a,b] # concatenate from top to bottom
Output:
array([[1],
[2],
[3],
[4],
[5],
[6]])
a,b = '123','abc'
for i, j in zip(a,b):
print (i,j)
Output:
1 a
2 b
3 c
# Build a 10x4 grid of zeros as nested lists (10 rows, matching the numpy version below).
[[0 for col in range(4)] for row in range(10)]
# use numpy
import numpy as np
np.zeros((10,4)).astype(int).tolist()
import matplotlib.pyplot as plt
m = 100  # number of sample points; any positive integer works
# Noisy quadratic data: X is uniform in [-3, 3), y = 0.5*X^2 + X + 2 plus Gaussian noise.
X = 6 * np.random.rand(m, 1) -3
y = 0.5 * X ** 2 + X + 2 + np.random.randn(m, 1)
plt.plot(X, y, 'b.') # b means blue, . means dot, not line
plt.show()
import matplotlib.pyplot as plt
plt.axis([0,90,0,3]) # X between 0 and 90, Y between 0 and 3
def add(x, y):
    """Return the sum of ``x`` and ``y``."""
    return x + y
from functools import reduce
reduce(add, [1,2,3,4,5]) # sum of the list: 1+2+3+4+5
Output: 15
reduce(lambda x, y: x+y, [1,2,3,4,5]) # same sum, using an anonymous lambda function
Output: 15
list(map(lambda x: x ** 2, [1,2,3,4]))
Output: [1, 4, 9, 16]
list(filter(lambda x: x>1, [1,2,3,4]))
Output: [2, 3, 4]
variables_names = [v.name for v in tf.trainable_variables()]
print(variables_names)
b0 = sess.run(sess.graph.get_tensor_by_name("dnn/outputs/bias:0"))
x = np.arange(7.0)
np.array_split(x, 3)
Output: [array([ 0., 1., 2.]), array([ 3., 4.]), array([ 5., 6.])]
origin = np.zeros((50,50))
print(origin.shape) # 50, 50
expanded = np.expand_dims(origin, -1) # -1 means inserted new axis at the end
print(expanded.shape) # 50, 50, 1
df1 = pd.DataFrame({'key': ['a', 'b', 'b', 'd'], 'data1': range(4)})
df2 = pd.DataFrame({'key': ['a', 'b', 'c'], 'data2': range(3)})
pd.merge(df1, df2) # Inner join on the shared 'key' column: keys missing from either frame ('c', 'd') are dropped, and a key repeated in df1 ('b') yields one output row per occurrence.
Output:
key data1 data2
0 a 0 0
1 b 1 1
2 b 2 1
def sample(a, b, c):
    """Print the three arguments separated by spaces."""
    print(a, b, c)  # print is a function in Python 3, so the parentheses are required
tp = ("Hello", "Python", "Learner")
sample(*tp)  # * unpacks the tuple into three positional arguments
def sample(name="Sample Python Post", type="Post", Date=""):
    """Print ``name``, ``type`` and ``Date`` separated by spaces."""
    print(name, type, Date)
d = {'type': "New Post", 'name': "* and ** in python", 'Date': ""}
sample(**d)  # ** unpacks the dict into keyword arguments matched by parameter name
# Assuming old image is np array of size (height, width, 3)
from PIL import Image
INPUT_SHAPE = (84, 84)
pil_img = Image.fromarray(input_img)
resize_img = pil_img.resize(INPUT_SHAPE)
grey_img = resize_img.convert('L')
output_img = np.array(grey_img)
assert output_img.shape == (84,84)
epsilon = 1.
max_epsilon = 1.
min_epsilon =0.01
decay_rate = 0.005
for episode in range(100):
    # Exponential decay: starts at max_epsilon for episode 0 and approaches
    # min_epsilon as the episode number grows.
    new_epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
a = np.array([[1,2,3],[4,5,6],[7,8,9]])
b = a[range(3), [0,1,2]]
print(b)
# Output: [1 5 9]
import time
start_time = time.time()
end_time = time.time() - start_time
aa = np.array([1,2,3,4])
print(np.random.choice(aa, 100, p=aa/sum(aa)))
# Output: array([3, 4, 4, 4, 4, 4, 1, 4, 4, 3, 1, 3, 1, 3, 3, 4, 3, 2, 3, 2, 4, 3,
4, 3, 4, 4, 1, 3, 4, 2, 2, 4, 3, 3, 4, 1, 4, 2, 2, 3, 4, 2, 3, 3,
4, 4, 3, 4, 1, 3, 1, 1, 3, 3, 1, 1, 4, 4, 4, 4, 3, 2, 2, 1, 2, 4,
4, 2, 2, 1, 4, 1, 2, 1, 2, 3, 3, 3, 4, 3, 4, 3, 3, 2, 2, 2, 4, 2,
4, 3, 2, 3, 3, 1, 4, 3, 1, 3, 4, 1])
# The proportion of 4 to 3 to 2 to 1 is 40%:30%:20%:10%
import tensorflow as tf
from keras.losses import mean_squared_error
def my_loss(y_pred, y_true):
    """Return the sum of squared differences along the last axis.

    The difference is squared before summing, so swapping ``y_pred`` and
    ``y_true`` does not change the result.
    """
    x = y_true - y_pred
    return tf.reduce_sum(tf.square(x), axis=-1)
y_pred = tf.Variable([[1.,2.,3.],[1.,2.,3.]])
y_true = tf.Variable([[1.,2.,4.],[1.,2.,3.]])
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
loss = mean_squared_error(y_pred, y_true)
loss2 = my_loss(y_pred, y_true)
loss_val, loss2_val = sess.run([loss, loss2])
print (loss_val, loss2_val)
arr = np.array([1,2,3,4])
print(arr.shape)
print(arr.reshape(-1, arr.shape[0]).shape) # Bad way
print(arr[None, :].shape) # Good way
import tensorflow as tf
tf.enable_eager_execution()
x = tf.contrib.eager.Variable([3.0], dtype=tf.float32) # Can't use tf.Variable due to eager execution
y = tf.contrib.eager.Variable([4.0], dtype=tf.float32)
with tf.GradientTape() as tape:
z = x ** 2 + y ** 2
grads = tape.gradient(z, [x, y])
opt = tf.train.GradientDescentOptimizer(learning_rate=0.01)
opt.apply_gradients(zip(grads, [x, y]))
print(grads, x.numpy(), y.numpy()) # Convert tensor to numpy array for output convenience
# Output
[<tf.Tensor: id=7247, shape=(1,), dtype=float32, numpy=array([6.], dtype=float32)>, <tf.Tensor: id=7228, shape=(1,), dtype=float32, numpy=array([8.], dtype=float32)>] [2.94] [3.92]
def char_num(word):
    """Return a dict mapping each character of ``word`` to its number of occurrences.

    Keys appear in order of first occurrence. An empty ``word`` gives ``{}``.
    """
    # Counter tallies in a single pass; calling word.count() once per
    # character would rescan the whole string every time.
    from collections import Counter
    return dict(Counter(word))
print(char_num('COMMUNICATION'))
# Output
{'C': 2, 'O': 2, 'M': 2, 'U': 1, 'N': 2, 'I': 2, 'A': 1, 'T': 1}
import itertools
candidate_set = [1,2,3,4]
print(list(itertools.combinations(candidate_set, 2)))
# Output
[(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
# Ordered arrangements of length 2; the itertools function is spelled permutations (plural).
print(list(itertools.permutations(candidate_set, 2)))
# Output
[(1, 2), (1, 3), (1, 4), (2, 1), (2, 3), (2, 4), (3, 1), (3, 2), (3, 4), (4, 1), (4, 2), (4, 3)]