numpy statistics.txt

np.mean average
np.median (sort the values and take the middle one)
np.corrcoef
np.std
np.sum
np.sort
np.column_stack


listOfCoordinates=list(zip(whiteIndex[0],whiteIndex[1]))

for cord in listOfCoordinates:
    print(demo2[cord[0],:])

matplotlib
# Line plot of pop against year with matplotlib.
import matplotlib.pyplot as plt
import numpy as np  # every other snippet in these notes calls numpy through `np`


plt.plot(year, pop)
plt.show()

or 
plt.scatter(year,pop)

or

divide into bins

plt.hist(race,bins=9)
plt.clf()
plt.xscale
plt.xlabel
plt.ylabel
plt.title
plt.yticks

# Relabel the y-axis: one tick position per label (1k, 10k, 100k).
plt.yticks([1000, 10000, 100000], ['1k', '10k', '100k'])

plt.scatter(gdp_cap, life_exp, s = pop, c=col)
s = size
c= color

listOfCoordinates= list(zip(result[0], result[1]))

hist1=np.array(hist)
gender=hist1[:,1]
race=hist1[:,2]


for cord in listOfCoordinates:
    print(cord)

Dataframe
pandas stack
import pandas as pd

# Build a DataFrame from a dict that maps column names to lists of values.
# (Named `data` rather than `dict` so the builtin is not shadowed.)
data = {"country": ["brazil", "russia"],
        "population": [200.4, 143.5]}

brics = pd.DataFrame(data)
# Alternative: load from CSV, using the first column as the row index.
brics = pd.read_csv("path/to/brics.csv", index_col=0)

df["A"] = df["A"].fillna(0)

index and select data

miles=costs['Miles'].tolist()
dollars=costs['EMDollars'].tolist()
costPerMile=costs['CostPerMile'].tolist()
print(type(costs.loc[:,['Miles']]))

print(cars.loc[['JPN']])
print(cars.iloc[[2]])

##series
print(cars.loc[:,'drives_right'])
##dataframe
print(cars.loc[:,['drives_right']])

equivalent
miles=costs['Miles'].tolist()
miles=costs.loc[:,'Miles']

comparison

Numpy
# Element-wise boolean combinations of numpy arrays.
print(np.logical_and(my_house < 11, your_house < 11))
print(np.logical_or(my_house > 18.5, my_house < 10))
# logical_not is unary: a second positional argument would be taken as the `out` array.
np.logical_not(val1)

Filtering pandas
costPerMile2=costs['CostPerMile']<1

dr=cars['drives_right']
sel=cars[dr]


cpc=cars["cars_per_cap"]
many_cars=cpc>500
car_maniac=cars[many_cars]

medium=cars['cars_per_cap']
sel=cars[np.logical_and(medium>100,medium<500)]

if z%2 ==0 :
 print('abc')
elif z%3==0 :
 print('def')
else :
 print('g')

while condition:
  expression

for var in seq:
  expression

fam=[1,2,3,4,5]
for height in fam :
    print(height)

fam=[1,2,3,4,5]
for index, height in enumerate(fam):
    print(str(height) + ":"+str(index))

for c in "family" :
    print(c.capitalize())

house = [["hallway", 11.25], 
         ["kitchen", 18.0], 
         ["living room", 20.0], 
         ["bedroom", 10.75], 
         ["bathroom", 9.50]]

for x in house :
    print(x[1])

#items() for a dictionary
world={"afghanistan":30.55,
       "albania":2.77,
       "algeria":39.21
      }
for key, value in world.items() :
    print(key + ' -- ' + str(value))

np_height=np.array([1.73,1.68,1.71,1.89,1.79])
np_weight=np.array([65.4,59.2,63.6,88.4,68.7])
measures= np.array([np_height, np_weight])
print(measures)

for val in np.nditer(measures):
    print(val)

for label,row in costs.iterrows():
    print(label)
    print(row)

for label,row in costs.iterrows():
    print(str(label)+":" +str(row["Equipment"]) + " "+str(row["Miles"]))

for label,row in costs.iterrows():
        costs.loc[label,"name_length"]=len(row["Equipment"])
print(costs)

# Vectorised alternative to a row-by-row loop: assign the lengths as a new column.
# (costs.loc["name_length"] = ... would create a ROW labelled "name_length" instead.)
costs["name_length"] = costs["EmployeeName"].apply(len)

# Seed the generator so the draws below are reproducible.
np.random.seed(123)
# randint's upper bound is exclusive, so this rolls a six-sided die (1..6).
dice = np.random.randint(1, 7)
print(np.random.rand())

# Random walk that starts at 50 and keeps rolling until the position is no longer positive.
step = 50
while step > 0:
    dice = np.random.randint(1, 7)
    if dice > 5:
        # Rolled a six: move up by a second die roll.
        step += np.random.randint(1, 7)
    elif dice > 2:
        # Rolled 3, 4 or 5: one step up.
        step += 1
    else:
        # Rolled 1 or 2: drop seven steps.
        step -= 7
    print(step)


# Ten reproducible coin flips: a draw of 0 is recorded as "heads", 1 as "tails".
np.random.seed(123)
labels = ("heads", "tails")
outcomes = []
for _ in range(10):
    coin = np.random.randint(0, 2)  # upper bound exclusive -> 0 or 1
    outcomes.append(labels[coin])

print(outcomes)

# Running total over 100 seeded coin flips: each entry is the previous total plus the 0/1 draw.
np.random.seed(123)
tails = [0]
for _ in range(100):
    coin = np.random.randint(0, 2)
    # The last element is always the total so far.
    tails.append(tails[-1] + coin)

print(tails)

np.random.seed(123456)
step=0
plot_steps=[]
for x in range(10000):
    dice=np.random.randint(1,7)
    if dice<=2 :
        step=step-1
    elif dice>=3 and dice<=5:
        step=step+1
    elif dice==6:
        step=step+np.random.randint(1,7)
    #print(step)
    plot_steps.append(step)
    
plt.plot(plot_steps)
plt.show()

np.random.seed(123456)
step=0
final_tails=[]
for x in range(10000):
    tails=[0]
    for y in range(10):
        coin=np.random.randint(0,2)
        tails.append(tails[y]+coin)
    
    final_tails.append(tails[-1])

#print(final_tails)
plt.hist(final_tails, bins=10)
plt.show()

np.transpose


intermediate Data Visualization with Seaborn
introduction to python for finance
recurrent neural networks for language modeling
dealing with missing data in python
feature engineering for nlp in python
introduction to deep learning with pytorch
hyperparameter tuning in python
introduction to tensorflow in python
introduction to data visualization with matplotlib
linear classifiers in python
machine learning for time series data in python
extreme gradient boosting with xgboost
supervised learning with scikit
data visualization with python

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

# Load the iris dataset (returned as a Bunch with data/target arrays).
iris = load_iris()

#bunch lets you use a python dict like an object
print(type(iris))
print(iris.keys())

#feature and target are numpy arrays
print('features type:' + str(type(iris.data)))
print('target type:' + str(type(iris.target)))
print(iris.data.shape)

X=iris.data
y=iris.target

df=pd.DataFrame(X,columns=iris.feature_names)
print(df.head(5))

#df.info()
#df.describe()
pd.plotting.scatter_matrix(df,c=y)

knn=KNeighborsClassifier(algorithm='auto', leaf_size=30,metric='minkowski',metric_params=None,n_jobs=1,n_neighbors=6,p=2,weights='uniform')

knn.fit(iris['data'],iris['target'])

#print(iris['data'])
#print(iris['target'])

X_new=np.array([[5.6,2.8,3.9,1.1],[5.7,2.6,3.8,1.3],[4.7,3.2,1.3,0.2]])
prediction=knn.predict(X_new);
print(prediction)

accuracy is the number of correct predictions divided by the total number of data points

model complexity curve
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.3, random_state=21, stratify=y)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
#print(X_test)
#print(y_pred)
knn.score(X_test,y_test)


from sklearn import datasets
import matplotlib.pyplot as plt

# Load the digits dataset: digits
digits = datasets.load_digits()

# Print the keys and DESCR of the dataset
print(digits.DESCR)
print(digits.keys())

# Print the shape of the images and data keys
print(digits.images.shape)
print(digits.data.shape)

# Display digit 1010
plt.imshow(digits.images[1010], cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()


model complexity curve

# Setup arrays to store train and test accuracies
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a k-NN Classifier with k neighbors: knn
    knn = KNeighborsClassifier(algorithm='auto',
                         leaf_size=30,
                         metric='minkowski',
                         metric_params=None,
                         n_jobs=1,
                         n_neighbors=k,
                         p=2,
                         weights='uniform')
    # Fit the classifier to the training data
    knn.fit(X_train,y_train)
    
    #Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)

    #Compute accuracy on the testing set
    test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

regression
reg=LinearRegression()
reg.fit(X_rooms,y)
prediction_space=np.linspace(min(X_rooms),max(X_rooms)).reshape(-1,1)

plt.scatter(X_rooms,y)
plt.ylabel('house price in $1000s')
plt.xlabel('number of rooms')
plt.plot(prediction_space, reg.predict(prediction_space),color='black', linewidth=3)
plt.show()


plt.plot(prediction_space, reg.predict(prediction_space),color='black', linewidth=3)

sns.heatmap(df.corr(), square=True, cmap='RdYlGn')

Ordinary least squares (OLS)
Minimize sum of squares of residuals

X_CrimeIndex=X[:,0]
X_CrimeIndex=X_CrimeIndex.reshape(-1,1)
X_train, X_test,y_train,y_test=train_test_split(X_CrimeIndex,y,test_size=0.2,random_state=42)
regression=LinearRegression()
regression.fit(X_train,y_train)
y_prediction=regression.predict(X_test)
plt.scatter(X_CrimeIndex,y)
plt.plot(X_test,y_prediction, c="Purple")
plt.ylabel('house price in $1000s')
plt.xlabel('crime index')
plt.show()
regression.score(X_test,y_test)

R2 needs to be close to 1

print("R^2: {}".format(regression.score(X_test,y_test)))
rmse = np.sqrt(mean_squared_error(y_test,y_prediction))
print("Root Mean Squared Error: {}".format(rmse))

Regularization
linear regression fits a model by minimizing a loss function
large coefficients can lead to overfitting
regularization penalizes large coefficients

Cross-Validation and model performance
5 fold validation 

Ridge regression (check for overfitting)

# Compare plain linear regression with two ridge models (small and large alpha)
# on a single feature: column 5 of X, reshaped to a 2-D column.
X_Rooms = X[:, 5]
X_Rooms = X_Rooms.reshape(-1, 1)
# Split the rooms feature built above (not X_CrimeIndex from the earlier example).
X_train, X_test, y_train, y_test = train_test_split(X_Rooms, y, test_size=0.2, random_state=42)

# Baseline: ordinary least squares.
lr = LinearRegression()
lr.fit(X_train, y_train)

train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)

# Ridge with a small penalty.
# NOTE(review): `normalize` has been removed from Ridge in recent scikit-learn
# releases -- confirm against the installed version before running.
rr = Ridge(alpha=0.1, normalize=True)
rr.fit(X_train, y_train)
rr_pred = rr.predict(X_test)
#print(rr_pred)
rr.score(X_test, y_test)

Ridge_train_score = rr.score(X_train, y_train)
Ridge_test_score = rr.score(X_test, y_test)

# Ridge with a large penalty, for contrast.
rr100 = Ridge(alpha=100)
rr100.fit(X_train, y_train)

Ridge_train_score100 = rr100.score(X_train, y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)

print ("linear regression train score:"+ str(train_score))
print ("linear regression test score:"+ str( test_score))
print ("ridge regression train score low alpha:"+ str( Ridge_train_score))
print ("ridge regression test score low alpha:"+ str( Ridge_test_score))
print ("ridge regression train score high alpha:"+ str( Ridge_train_score100))
print ("ridge regression test score high alpha:"+ str( Ridge_test_score100))

# Plot the fitted coefficients of the three models against their index.
# In plt.plot, `alpha` is marker transparency; `zorder` controls which markers draw on top.
plt.plot(rr.coef_, alpha=0.7, linestyle='none', marker='*', markersize=5, color='red', label=r'Ridge; $\alpha = 0.1$', zorder=7)
plt.plot(rr100.coef_, alpha=0.5, linestyle='none', marker='d', markersize=6, color='blue', label=r'Ridge; $\alpha = 100$')
plt.plot(lr.coef_, alpha=0.4, linestyle='none', marker='o', markersize=7, color='green', label='Linear Regression')
plt.xlabel('Coefficient Index', fontsize=16)
plt.ylabel('Coefficient Magnitude', fontsize=16)
plt.legend(fontsize=13, loc=4)
plt.show()


lasso can be used to determine important features in a dataset

# Fit a lasso model and plot one coefficient per feature name.
print(df.feature_names)
names = df.feature_names
lasso = Lasso(alpha=0.1)
lasso_coef = lasso.fit(X, y).coef_

# The `_ =` assignments just discard matplotlib's return values.
_ = plt.plot(range(len(names)), lasso_coef)
_ = plt.xticks(range(len(names)), names, rotation=60)
_ = plt.ylabel('Coefficients')
plt.show()


def display_plot(cv_scores, cv_scores_std):
    """Plot CV scores against the module-level ``alpha_space`` with a shaded error band.

    Args:
        cv_scores: Scores to draw, one per entry of ``alpha_space``.
        cv_scores_std: Matching standard deviations; divided by ``sqrt(10)``
            to get the band half-width (presumably 10 is the number of CV
            folds -- confirm against the caller).

    The x-axis is log-scaled and a dashed horizontal line marks the highest
    score. The figure is shown immediately; nothing is returned.
    """
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    axes.plot(alpha_space, cv_scores)

    # Half-width of the shaded band around the score curve.
    half_band = cv_scores_std / np.sqrt(10)
    upper = cv_scores + half_band
    lower = cv_scores - half_band
    axes.fill_between(alpha_space, upper, lower, alpha=0.2)

    axes.set_ylabel('CV Score +/- Std Error')
    axes.set_xlabel('Alpha')
    # Dashed grey reference line at the highest score.
    axes.axhline(np.max(cv_scores), linestyle='--', color='.5')
    axes.set_xlim([alpha_space[0], alpha_space[-1]])
    axes.set_xscale('log')
    plt.show()

confusion matrix
true positive    false negative
false positive   true negative

accuracy=(tp+tn) / (tp+tn+fp+fn)
precision= tp/(tp+fp)
recall=tp/(tp+fn)

f1= 2* (precision*recall)/(precision+recall)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

knn=KNeighborsClassifier(algorithm='auto',
                         leaf_size=30,
                         metric='minkowski',
                         metric_params=None,
                         n_jobs=1,
                         n_neighbors=8,
                         p=2,
                         weights='uniform')

#print(X)
#print(y)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=42)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Logistic Regression
used in classification problems

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

two possible labels

logreg=LogisticRegression()
X_train, X_test, y_train,y_test=train_test_split(X,y,test_size=0.4, random_state=42)
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)

Receiver Operating Characteristic (ROC) curve


hyperparameter tuning

logreg=LogisticRegression(solver="lbfgs")
#a large C will result in overfit and a small C will lead to underfit
c_space = np.logspace(-4, -0.5, 30)
#print(c_space)
param_grid = {'C': c_space}
#param_grid={'C': np.arange(1,5)}

X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3,random_state=42)

#logreg_cv=GridSearchCV(logreg,param_grid,cv=5)
#logreg_cv.fit(X_train,y_train)

#print(logreg_cv.best_params_)
#print(logreg_cv.best_score_)

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings. Defaults are not spelled out, so the
# call does not depend on parameters removed from scikit-learn
# (min_impurity_split, presort).
tree = DecisionTreeClassifier()
# Search space: lists are sampled uniformly; randint(1, 9) draws integers 1..8.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}
# Pass the tree's own search space (param_dist), not the logistic-regression C grid.
tree_cv = RandomizedSearchCV(tree, param_dist, cv=3)
tree_cv.fit(X, y)

#print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
#print("Best score is {}".format(tree_cv.best_score_))


Hold-out set reasoning

1. split data into training and hold-out set at the beginning
2. perform grid search cross validation on training set
3. choose best hyperparameters and evaluate on the hold-out set

Lasso used the L1 penalty to regularize, while ridge used the 
L2 penalty

isin

cities = ["Moscow", "Saint Petersburg"]

# Subset temperatures using square brackets
print(temperatures[temperatures["city"].isin(cities)])

# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])

>>>tuple pairs to keep

# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country", "city"])

# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]

# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])

print(temperatures_ind.sort_index())

# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="city"))

# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country", "city"], ascending = [True, False]))

>>>loc

# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()

# Subset rows from Pakistan to Russia
print(temperatures_srt.loc["Pakistan":"Russia"])

# Incorrectly subset rows from Lahore to Moscow
print(temperatures_srt.loc["Lahore":"Moscow"])

# Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[("Pakistan", "Lahore"):("Russia", "Moscow")])

>>>>loc

# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad")])

# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:, "date":"avg_temp_c"].head(5))

# Subset in both directions at once
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"), "date":"avg_temp_c"])

>>Condition

# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
print(temperatures[(temperatures["date"] >= "2010") & (temperatures["date"] < "2012")])

# Set date as an index
temperatures_ind = temperatures.set_index("date")

# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010":"2011"])

# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])

>>>>>iLoc
# Get 23rd row, 2nd column (index 22, 1)
print(temperatures.iloc[22,1])

# Use slicing to get the first 5 rows
print(temperatures.iloc[:5,:])

# Use slicing to get columns 2 to 3
print(temperatures.iloc[:,2:4])

# Use slicing in both directions at once
print(temperatures.iloc[:5,2:4])

>>>>Pivot Table
# Add a year column to temperatures
temperatures["year"] = temperatures["date"].dt.year

# Pivot avg_temp_c by country and city vs year
temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index = ["country", "city"], columns = "year")

# See the result
print(temp_by_country_city_vs_year)

>>>>Subsetting with loc

temp_by_country_city_vs_year.loc["Egypt":"India"]

# Subset for Egypt, Cairo to India, Delhi
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi")]

# Subset in both directions at once
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi"), "2005":"2010"]

>>Unpivot a table


def unpivot(frame):
    """Reshape a wide 2-D frame into long form, one row per cell.

    Cells are read column by column. Each output row holds the cell's index
    label (``date``), its column label (``variable``) and the cell itself
    (``value``).

    Args:
        frame: A 2-D pandas DataFrame.

    Returns:
        A DataFrame with columns ``date``, ``variable`` and ``value``.
    """
    n_rows, n_cols = frame.shape
    # Column-major flattening keeps each column's cells contiguous.
    values = frame.to_numpy().ravel(order='F')
    # Each column label is repeated once per row ...
    variables = np.repeat(np.asarray(frame.columns), n_rows)
    # ... and the whole index is cycled once per column.
    dates = np.tile(np.asarray(frame.index), n_cols)
    long_form = {'date': dates, 'variable': variables, 'value': values}
    return pd.DataFrame(long_form, columns=['date', 'variable', 'value'])

>>Find the max mean in a data set

# Get the worldwide mean temp by year
mean_temp_by_year = temp_by_country_city_vs_year.mean()

# Find the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])

# Get the mean temp by city
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")

# Find the city that had the lowest mean temp
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])

>>>Plot bar

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Look at the first few rows of data
print(avocados.head(5))

# Get the total number of avocados sold of each size
nb_sold_by_size = avocados.groupby('size')['nb_sold'].sum()
print(nb_sold_by_size)

# Create a bar plot of the number of avocados sold by size
# (plot through the Series; kind must be the string "bar")
nb_sold_by_size.plot(kind="bar")

# Show the plot
plt.show()
>>Plot line

import matplotlib.pyplot as plt

# Get the total number of avocados sold on each date
nb_sold_by_date = avocados.groupby('date')['nb_sold'].sum()

# Create a line plot of the number of avocados sold by date
nb_sold_by_date.plot(kind='line', x='date',y='nb_sold' )

# Show the plot
plt.show()

>>Histogram

# Histogram of conventional avg_price 
avocados[avocados['type']=='conventional']['avg_price'].hist(alpha=0.7)

# Histogram of organic avg_price
avocados[avocados['type']=='organic']['avg_price'].hist(alpha=0.7)

# Add a legend (labels go in one list, in the order the histograms were drawn)
plt.legend(['conventional', 'organic'])

# Show the plot
plt.show()

>>Is a Number

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Check individual values for missing values
print(avocados_2016.isna())

# Check each column for missing values
print(avocados_2016.isna().any())

# Bar plot of missing values by variable
avocados_2016.isna().sum().plot(kind="bar")

# Show plot
plt.show()

>>>drop na
avocados_complete = avocados_2016.dropna()

# Check if any columns contain missing values
print(avocados_complete.isna().any())

>>>fillna(0)

# From previous step
cols_with_missing = ["small_sold", "large_sold", "xl_sold"]
avocados_2016[cols_with_missing].hist()
plt.show()

# Fill in missing values with 0
avocados_filled = avocados_2016.fillna(0)

# Create histograms of the filled columns
avocados_filled[cols_with_missing].hist()

# Show the plot
plt.show()