Python-c07-Filtering-and-Ordering

1
import pandas as pd
1
df = pd.read_csv("world_population.csv")
1
df[df["Rank"] < 10]

Rank CCA3 Country Capital Continent 2022 Population 2020 Population 2015 Population 2010 Population 2000 Population 1990 Population 1980 Population 1970 Population Area (km²) Density (per km²) Growth Rate World Population Percentage
16 8 BGD Bangladesh Dhaka Asia 1.711864e+08 1.674210e+08 1.578300e+08 1.483911e+08 1.291933e+08 1.071477e+08 83929765.0 67541860.0 147570.0 1160.0350 1.0108 2.15
27 7 BRA Brazil Brasilia South America 2.153135e+08 2.131963e+08 2.051882e+08 1.963535e+08 1.758737e+08 1.507064e+08 122288383.0 96369875.0 8515767.0 25.2841 1.0046 2.70
41 1 CHN China Beijing Asia 1.425887e+09 1.424930e+09 1.393715e+09 1.348191e+09 1.264099e+09 1.153704e+09 982372466.0 822534450.0 9706961.0 146.8933 1.0000 17.88
92 2 IND India New Delhi Asia 1.417173e+09 1.396387e+09 1.322867e+09 1.240614e+09 1.059634e+09 NaN NaN 557501301.0 3287590.0 431.0675 1.0068 17.77
93 4 IDN Indonesia Jakarta Asia 2.755013e+08 2.718580e+08 2.590920e+08 2.440162e+08 2.140724e+08 1.821599e+08 148177096.0 115228394.0 1904569.0 144.6529 1.0064 3.45
149 6 NGA Nigeria Abuja Africa 2.185412e+08 2.083274e+08 1.839958e+08 1.609529e+08 1.228520e+08 9.521426e+07 72951439.0 55569264.0 923768.0 236.5759 1.0241 2.74
156 5 PAK Pakistan Islamabad Asia 2.358249e+08 2.271967e+08 2.109693e+08 1.944545e+08 1.543699e+08 1.154141e+08 80624057.0 59290872.0 881912.0 267.4018 1.0191 2.96
171 9 RUS Russia Moscow Europe 1.447133e+08 1.456173e+08 1.446684e+08 1.432426e+08 1.468448e+08 1.480057e+08 138257420.0 130093010.0 17098242.0 8.4636 0.9973 1.81
221 3 USA United States Washington, D.C. North America 3.382899e+08 3.359420e+08 3.246078e+08 3.111828e+08 2.823986e+08 2.480837e+08 223140018.0 200328340.0 9372610.0 36.0935 1.0038 4.24
1
2
specific_countries = ['Bangladesh', 'Brazil']
df[df['Country'].isin(specific_countries)]

Rank CCA3 Country Capital Continent 2022 Population 2020 Population 2015 Population 2010 Population 2000 Population 1990 Population 1980 Population 1970 Population Area (km²) Density (per km²) Growth Rate World Population Percentage
16 8 BGD Bangladesh Dhaka Asia 171186372.0 167420951.0 157830000.0 148391139.0 129193327.0 107147651.0 83929765.0 67541860.0 147570.0 1160.0350 1.0108 2.15
27 7 BRA Brazil Brasilia South America 215313498.0 213196304.0 205188205.0 196353492.0 175873720.0 150706446.0 122288383.0 96369875.0 8515767.0 25.2841 1.0046 2.70
1
2
# filter the countries whose name contains "United"
df[df['Country'].str.contains('United')]

Rank CCA3 Country Capital Continent 2022 Population 2020 Population 2015 Population 2010 Population 2000 Population 1990 Population 1980 Population 1970 Population Area (km²) Density (per km²) Growth Rate World Population Percentage
219 97 ARE United Arab Emirates Abu Dhabi Asia 9441129.0 9287289.0 8916899.0 8481771.0 3275333.0 1900151.0 1014048.0 298084.0 83600.0 112.9322 1.0081 0.12
220 21 GBR United Kingdom London Europe 67508936.0 67059474.0 65224364.0 62760039.0 58850043.0 57210442.0 56326328.0 55650166.0 242900.0 277.9289 1.0034 0.85
221 3 USA United States Washington, D.C. North America 338289857.0 335942003.0 324607776.0 311182845.0 282398554.0 248083732.0 223140018.0 200328340.0 9372610.0 36.0935 1.0038 4.24
222 200 VIR United States Virgin Islands Charlotte Amalie North America 99465.0 100442.0 102803.0 106142.0 108185.0 100685.0 96640.0 63446.0 347.0 286.6427 0.9937 0.00
1
2
3
4
# set index to country
df2 = df.set_index('Country')
# show only the columns Continent and CCA3
df2.filter(items = ['Continent', 'CCA3'])

Continent CCA3
Country
Afghanistan Asia AFG
Albania Europe ALB
Algeria Africa DZA
American Samoa Oceania ASM
Andorra Europe AND
... ... ...
Wallis and Futuna Oceania WLF
Western Sahara Africa ESH
Yemen Asia YEM
Zambia Africa ZMB
Zimbabwe Africa ZWE

234 rows × 2 columns

1
2
3
# axis=0 is the vertical axis (the row index) and axis=1 is the horizontal axis (the columns)
# The following returns 0 rows because 'Continent' and 'CCA3' are not labels on the vertical axis, which is indexed by Country
df2.filter(items=['Continent', 'CCA3'], axis=0)

Rank CCA3 Capital Continent 2022 Population 2020 Population 2015 Population 2010 Population 2000 Population 1990 Population 1980 Population 1970 Population Area (km²) Density (per km²) Growth Rate World Population Percentage
1
2
# but axis=1 works because 'Continent' and 'CCA3' exist on the horizontal axis (the columns)
df2.filter(items=['Continent', 'CCA3'], axis=1)

Continent CCA3
Country
Afghanistan Asia AFG
Albania Europe ALB
Algeria Africa DZA
American Samoa Oceania ASM
Andorra Europe AND
... ... ...
Wallis and Futuna Oceania WLF
Western Sahara Africa ESH
Yemen Asia YEM
Zambia Africa ZMB
Zimbabwe Africa ZWE

234 rows × 2 columns

1
2
# This returns all the countries whose name contains "United"
df2.filter(like='United', axis=0)

Rank CCA3 Capital Continent 2022 Population 2020 Population 2015 Population 2010 Population 2000 Population 1990 Population 1980 Population 1970 Population Area (km²) Density (per km²) Growth Rate World Population Percentage
Country
United Arab Emirates 97 ARE Abu Dhabi Asia 9441129.0 9287289.0 8916899.0 8481771.0 3275333.0 1900151.0 1014048.0 298084.0 83600.0 112.9322 1.0081 0.12
United Kingdom 21 GBR London Europe 67508936.0 67059474.0 65224364.0 62760039.0 58850043.0 57210442.0 56326328.0 55650166.0 242900.0 277.9289 1.0034 0.85
United States 3 USA Washington, D.C. North America 338289857.0 335942003.0 324607776.0 311182845.0 282398554.0 248083732.0 223140018.0 200328340.0 9372610.0 36.0935 1.0038 4.24
United States Virgin Islands 200 VIR Charlotte Amalie North America 99465.0 100442.0 102803.0 106142.0 108185.0 100685.0 96640.0 63446.0 347.0 286.6427 0.9937 0.00
1
df2.loc['United States']
Rank                                          3
CCA3                                        USA
Capital                        Washington, D.C.
Continent                         North America
2022 Population                     338289857.0
2020 Population                     335942003.0
2015 Population                     324607776.0
2010 Population                     311182845.0
2000 Population                     282398554.0
1990 Population                     248083732.0
1980 Population                     223140018.0
1970 Population                     200328340.0
Area (km²)                            9372610.0
Density (per km²)                       36.0935
Growth Rate                              1.0038
World Population Percentage                4.24
Name: United States, dtype: object
1
df2.iloc[3]
Rank                                 213
CCA3                                 ASM
Capital                        Pago Pago
Continent                        Oceania
2022 Population                  44273.0
2020 Population                  46189.0
2015 Population                  51368.0
2010 Population                  54849.0
2000 Population                  58230.0
1990 Population                  47818.0
1980 Population                  32886.0
1970 Population                  27075.0
Area (km²)                         199.0
Density (per km²)               222.4774
Growth Rate                       0.9831
World Population Percentage          0.0
Name: American Samoa, dtype: object
1
2
# sort by Rank and Country in ascending
df[df['Rank'] < 10].sort_values(by=['Rank', 'Country'], ascending=True)

Rank CCA3 Country Capital Continent 2022 Population 2020 Population 2015 Population 2010 Population 2000 Population 1990 Population 1980 Population 1970 Population Area (km²) Density (per km²) Growth Rate World Population Percentage
41 1 CHN China Beijing Asia 1.425887e+09 1.424930e+09 1.393715e+09 1.348191e+09 1.264099e+09 1.153704e+09 982372466.0 822534450.0 9706961.0 146.8933 1.0000 17.88
92 2 IND India New Delhi Asia 1.417173e+09 1.396387e+09 1.322867e+09 1.240614e+09 1.059634e+09 NaN NaN 557501301.0 3287590.0 431.0675 1.0068 17.77
221 3 USA United States Washington, D.C. North America 3.382899e+08 3.359420e+08 3.246078e+08 3.111828e+08 2.823986e+08 2.480837e+08 223140018.0 200328340.0 9372610.0 36.0935 1.0038 4.24
93 4 IDN Indonesia Jakarta Asia 2.755013e+08 2.718580e+08 2.590920e+08 2.440162e+08 2.140724e+08 1.821599e+08 148177096.0 115228394.0 1904569.0 144.6529 1.0064 3.45
156 5 PAK Pakistan Islamabad Asia 2.358249e+08 2.271967e+08 2.109693e+08 1.944545e+08 1.543699e+08 1.154141e+08 80624057.0 59290872.0 881912.0 267.4018 1.0191 2.96
149 6 NGA Nigeria Abuja Africa 2.185412e+08 2.083274e+08 1.839958e+08 1.609529e+08 1.228520e+08 9.521426e+07 72951439.0 55569264.0 923768.0 236.5759 1.0241 2.74
27 7 BRA Brazil Brasilia South America 2.153135e+08 2.131963e+08 2.051882e+08 1.963535e+08 1.758737e+08 1.507064e+08 122288383.0 96369875.0 8515767.0 25.2841 1.0046 2.70
16 8 BGD Bangladesh Dhaka Asia 1.711864e+08 1.674210e+08 1.578300e+08 1.483911e+08 1.291933e+08 1.071477e+08 83929765.0 67541860.0 147570.0 1160.0350 1.0108 2.15
171 9 RUS Russia Moscow Europe 1.447133e+08 1.456173e+08 1.446684e+08 1.432426e+08 1.468448e+08 1.480057e+08 138257420.0 130093010.0 17098242.0 8.4636 0.9973 1.81
1
2
# sort by Rank in ascending and Country in descending
df[df['Rank'] < 10].sort_values(by=['Rank', 'Country'], ascending=[True, False])

Rank CCA3 Country Capital Continent 2022 Population 2020 Population 2015 Population 2010 Population 2000 Population 1990 Population 1980 Population 1970 Population Area (km²) Density (per km²) Growth Rate World Population Percentage
41 1 CHN China Beijing Asia 1.425887e+09 1.424930e+09 1.393715e+09 1.348191e+09 1.264099e+09 1.153704e+09 982372466.0 822534450.0 9706961.0 146.8933 1.0000 17.88
92 2 IND India New Delhi Asia 1.417173e+09 1.396387e+09 1.322867e+09 1.240614e+09 1.059634e+09 NaN NaN 557501301.0 3287590.0 431.0675 1.0068 17.77
221 3 USA United States Washington, D.C. North America 3.382899e+08 3.359420e+08 3.246078e+08 3.111828e+08 2.823986e+08 2.480837e+08 223140018.0 200328340.0 9372610.0 36.0935 1.0038 4.24
93 4 IDN Indonesia Jakarta Asia 2.755013e+08 2.718580e+08 2.590920e+08 2.440162e+08 2.140724e+08 1.821599e+08 148177096.0 115228394.0 1904569.0 144.6529 1.0064 3.45
156 5 PAK Pakistan Islamabad Asia 2.358249e+08 2.271967e+08 2.109693e+08 1.944545e+08 1.543699e+08 1.154141e+08 80624057.0 59290872.0 881912.0 267.4018 1.0191 2.96
149 6 NGA Nigeria Abuja Africa 2.185412e+08 2.083274e+08 1.839958e+08 1.609529e+08 1.228520e+08 9.521426e+07 72951439.0 55569264.0 923768.0 236.5759 1.0241 2.74
27 7 BRA Brazil Brasilia South America 2.153135e+08 2.131963e+08 2.051882e+08 1.963535e+08 1.758737e+08 1.507064e+08 122288383.0 96369875.0 8515767.0 25.2841 1.0046 2.70
16 8 BGD Bangladesh Dhaka Asia 1.711864e+08 1.674210e+08 1.578300e+08 1.483911e+08 1.291933e+08 1.071477e+08 83929765.0 67541860.0 147570.0 1160.0350 1.0108 2.15
171 9 RUS Russia Moscow Europe 1.447133e+08 1.456173e+08 1.446684e+08 1.432426e+08 1.468448e+08 1.480057e+08 138257420.0 130093010.0 17098242.0 8.4636 0.9973 1.81

PS:
world_population.csv

Python-d01-selenium

Here is an example of using Selenium in Python on LinkedIn.
This code connects to LinkedIn and sends the message “testing” to the first friend.
You have to replace the username and password placeholders with your own credentials to authenticate.

This code works with Python 3.8 and Selenium 4.14.0.

1
2
3
4
5
6
7
8
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

import time

# Raw string: the Windows path contains backslashes (\d, \s, \c) that Python
# would otherwise try to interpret as (invalid) escape sequences.
driver = webdriver.Chrome(service=Service(executable_path=r'C:\dev\softwares\chromedriver\chromedriver.exe'))
# Open the LinkedIn landing page, which hosts the login form.
driver.get('https://www.linkedin.com/')
1
2
3
4
5
6
7
8
9
# Give the landing page a moment to render before looking up the login form.
time.sleep(2)

# Locate the login inputs by their `name` attributes and type the credentials.
# USERNAME_TO_CHANGE / PASSWORD_TO_CHANGE are placeholders: replace them with
# your own credentials (as strings) before running.
username = driver.find_element(By.XPATH, "//input[@name='session_key']")
password = driver.find_element(By.XPATH, "//input[@name='session_password']")
username.send_keys(USERNAME_TO_CHANGE)
password.send_keys(PASSWORD_TO_CHANGE)

time.sleep(2)
# click() returns None, so its result is not bound to a name.
driver.find_element(By.XPATH, "//button[@type='submit']").click()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Open the people-search results page. The query string's
# network=%5B%22F%22%5D decodes to network=["F"] — presumably the
# first-degree-connection filter (the post says "first friend"); confirm.
driver.get("https://www.linkedin.com/search/results/people/?network=%5B%22F%22%5D&origin=FACETED_SEARCH")
time.sleep(2)

# Collect every <button> on the page and keep those whose visible text is
# exactly "Message".
all_buttons = driver.find_elements(By.TAG_NAME, 'button')
message_buttons = [btn for btn in all_buttons if btn.text == "Message"]

# Click the first match; this raises IndexError when no "Message" button
# was found on the page.
message_buttons[0].click()

time.sleep(2)

# Click the <div> whose class starts with 'msg-form__msg-content-container'
# (presumably the message compose area, to give it focus).
main_div = driver.find_element(By.XPATH, "//div[starts-with(@class, 'msg-form__msg-content-container')]")
main_div.click()

time.sleep(2)

# Type the message into the fifth <p> element counted from the end of the page.
# NOTE(review): this positional index depends on the current page layout — verify.
paragraphs = driver.find_elements(By.TAG_NAME, "p")
paragraphs[-5].send_keys("testing")

ml-a03-polynomial

1
2
3
4
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
1
2
3
4
5
data = np.genfromtxt("job.csv", delimiter=",")
x_data = data[1:,1]
y_data = data[1:,2]
plt.scatter(x_data,y_data)
plt.show()
1
2
3
4
5
6
7
8
9
10
# In this example we use LinearRegression; we can see that the straight line does not fit the points well

x_data = data[1:,1,np.newaxis]
y_data = data[1:,2,np.newaxis]
model = LinearRegression()
model.fit(x_data, y_data)

plt.plot(x_data,y_data, 'b.')
plt.plot(x_data,model.predict(x_data), 'r')
plt.show()
1
2
3
4
5
6
7
8
poly_reg = PolynomialFeatures(degree=5)
# PolynomialFeatures adds powers of the input as new feature columns: degree=1 => x^0 and x^1, degree=2 => x^0, x^1 and x^2, ..., degree=5 => x^0 through x^5
x_poly = poly_reg.fit_transform(x_data)
lin_reg = LinearRegression()
lin_reg.fit(x_poly, y_data)

x_poly

array([[1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00],
       [1.0000e+00, 2.0000e+00, 4.0000e+00, 8.0000e+00, 1.6000e+01,
        3.2000e+01],
       [1.0000e+00, 3.0000e+00, 9.0000e+00, 2.7000e+01, 8.1000e+01,
        2.4300e+02],
       [1.0000e+00, 4.0000e+00, 1.6000e+01, 6.4000e+01, 2.5600e+02,
        1.0240e+03],
       [1.0000e+00, 5.0000e+00, 2.5000e+01, 1.2500e+02, 6.2500e+02,
        3.1250e+03],
       [1.0000e+00, 6.0000e+00, 3.6000e+01, 2.1600e+02, 1.2960e+03,
        7.7760e+03],
       [1.0000e+00, 7.0000e+00, 4.9000e+01, 3.4300e+02, 2.4010e+03,
        1.6807e+04],
       [1.0000e+00, 8.0000e+00, 6.4000e+01, 5.1200e+02, 4.0960e+03,
        3.2768e+04],
       [1.0000e+00, 9.0000e+00, 8.1000e+01, 7.2900e+02, 6.5610e+03,
        5.9049e+04],
       [1.0000e+00, 1.0000e+01, 1.0000e+02, 1.0000e+03, 1.0000e+04,
        1.0000e+05]])
1
2
3
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, lin_reg.predict(x_poly), 'r')
plt.show()

PS:
job.csv

ml-a02-multiple_linear_regression

Multiple Linear Regression

1
2
3
4
import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
1
2
data = genfromtxt(r"Delivery.csv", delimiter=",")
print(data)
[[100.    4.    9.3]
 [ 50.    3.    4.8]
 [100.    4.    8.9]
 [100.    2.    6.5]
 [ 50.    2.    4.2]
 [ 80.    2.    6.2]
 [ 75.    3.    7.4]
 [ 65.    4.    6. ]
 [ 90.    3.    7.6]
 [ 90.    2.    6.1]]
1
2
3
4
5
6
# all data until the last column
x_data = data[:, :-1]
# label: the last column
y_data = data[:, -1]
print(x_data)
print(y_data)
[[100.   4.]
 [ 50.   3.]
 [100.   4.]
 [100.   2.]
 [ 50.   2.]
 [ 80.   2.]
 [ 75.   3.]
 [ 65.   4.]
 [ 90.   3.]
 [ 90.   2.]]
[9.3 4.8 8.9 6.5 4.2 6.2 7.4 6.  7.6 6.1]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
def compute_error(theta0, theta1, theta2, x_data, y_data):
    """Return the mean squared error of the plane theta0 + theta1*x0 + theta2*x1.

    Args:
        theta0: Intercept of the plane.
        theta1: Coefficient applied to the first feature (column 0).
        theta2: Coefficient applied to the second feature (column 1).
        x_data: Indexable collection of samples; each sample exposes its two
            features at positions 0 and 1.
        y_data: Target values, indexable in step with ``x_data``.

    Returns:
        The sum of squared residuals divided by the number of samples.

    Raises:
        ZeroDivisionError: If ``x_data`` is empty.
        IndexError: If ``y_data`` has fewer entries than ``x_data``.
    """
    total_error = 0
    # Accumulate sample by sample, in order, so the floating-point result
    # matches a straightforward left-to-right summation.
    for i, row in enumerate(x_data):
        prediction = theta1 * row[0] + theta2 * row[1] + theta0
        total_error += (y_data[i] - prediction) ** 2
    return total_error / float(len(x_data))

def gradient_descent_runner(x_data, y_data, theta0, theta1, theta2, lr, epochs):
    """Fit the plane theta0 + theta1*x0 + theta2*x1 with batch gradient descent.

    Args:
        x_data: Indexable collection of samples; each sample exposes its two
            features at positions 0 and 1.
        y_data: Target values, indexable in step with ``x_data``.
        theta0: Starting intercept.
        theta1: Starting coefficient for the first feature.
        theta2: Starting coefficient for the second feature.
        lr: Learning rate (step size applied to each averaged gradient).
        epochs: Number of full passes over the data.

    Returns:
        The tuple ``(theta0, theta1, theta2)`` after ``epochs`` updates.
    """
    # total data number
    m = float(len(x_data))
    for _ in range(epochs):
        theta0_grad = 0
        theta1_grad = 0
        theta2_grad = 0
        for j, row in enumerate(x_data):
            x0, x1 = row[0], row[1]
            # Prediction error for this sample, shared by all three gradients.
            residual = (theta1 * x0 + theta2 * x1 + theta0) - y_data[j]
            theta0_grad += (1 / m) * residual
            theta1_grad += (1 / m) * x0 * residual
            theta2_grad += (1 / m) * x1 * residual
        # Update all parameters together, only after the whole pass, so every
        # gradient in an epoch is computed from the same parameter values.
        theta0 = theta0 - (lr * theta0_grad)
        theta1 = theta1 - (lr * theta1_grad)
        theta2 = theta2 - (lr * theta2_grad)
    return theta0, theta1, theta2
1
2
3
4
5
6
7
8
9
10
11
12
lr = 0.0001
theta0 = 0
theta1 = 0
theta2 = 0
epochs = 1000
print("starting theta0={0}, theta1={1}, theta2={2}, error={3}"
.format( theta0,theta1,theta2,compute_error(theta0, theta1, theta2, x_data, y_data)))
print("Running...")
theta0, theta1, theta2 = gradient_descent_runner(x_data, y_data, theta0, theta1, theta2, lr, epochs)
print("After {0} iterations theta0={1}, theta1={2}, theta2={3}, error={4}"
.format( epochs, theta0,theta1,theta2,compute_error(theta0, theta1, theta2, x_data, y_data)))

starting theta0=0, theta1=0, theta2=0, error=47.279999999999994
Running...
After 1000 iterations theta0=0.006971416196678632, theta1=0.08021042690771771, theta2=0.07611036240566814, error=0.7731271432218118
1
2
3
4
5
6
7
8
9
10
11
12
13
14
ax = plt.figure().add_subplot(111, projection="3d")
# c='r' => red color, marker='o' => circle markers, s=100 => marker size 100
ax.scatter(x_data[:, 0], x_data[:, 1], y_data, c='r', marker="o", s=100)

# Build a grid over the observed feature values to draw the fitted plane on.
x0 = x_data[:, 0]
x1 = x_data[:, 1]
x0, x1 = np.meshgrid(x0, x1)
# Fitted plane: each feature is weighted by its own coefficient
# (theta1 for x0, theta2 for x1).
z = theta0 + theta1 * x0 + theta2 * x1
ax.plot_surface(x0, x1, z)

ax.set_xlabel('Miles')
ax.set_ylabel('Num of Deliveries')
ax.set_zlabel('Time')
plt.show()

PS:
Delivery.csv

Sklearn

1
2
3
4
5
import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
from sklearn import linear_model
from mpl_toolkits.mplot3d import Axes3D
1
2
data = genfromtxt(r"Delivery.csv", delimiter=",")
print(data)
[[100.    4.    9.3]
 [ 50.    3.    4.8]
 [100.    4.    8.9]
 [100.    2.    6.5]
 [ 50.    2.    4.2]
 [ 80.    2.    6.2]
 [ 75.    3.    7.4]
 [ 65.    4.    6. ]
 [ 90.    3.    7.6]
 [ 90.    2.    6.1]]
1
2
3
4
5
6
# all data until the last column
x_data = data[:, :-1]
# label: the last column
y_data = data[:, -1]
print(x_data)
print(y_data)
[[100.   4.]
 [ 50.   3.]
 [100.   4.]
 [100.   2.]
 [ 50.   2.]
 [ 80.   2.]
 [ 75.   3.]
 [ 65.   4.]
 [ 90.   3.]
 [ 90.   2.]]
[9.3 4.8 8.9 6.5 4.2 6.2 7.4 6.  7.6 6.1]
1
2
model = linear_model.LinearRegression()
model.fit(x_data, y_data)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
1
2
3
4
5
6
print("coefficients:", model.coef_)
print("intercept:", model.intercept_)

x_test = [[102,4]]
predict = model.predict(x_test)
print("predict:", predict)
coefficients: [0.0611346  0.92342537]
intercept: -0.868701466781709
predict: [9.06072908]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
ax = plt.figure().add_subplot(111, projection="3d")
ax.scatter(x_data[:,0], x_data[:,1], y_data, c='r', marker='o', s=100)
x0 = x_data[:,0]
x1 = x_data[:,1]
x0,x1 = np.meshgrid(x0,x1)

z = model.intercept_ + x0*model.coef_[0] + x1*model.coef_[1]

ax.plot_surface(x0, x1, z)
ax.set_xlabel('Miles')
ax.set_ylabel('Num of Deliveries')
ax.set_zlabel('Time')

plt.show()

ml-a01-regression

Regression and Gradient Descent

1
2
import numpy as np
import matplotlib.pyplot as plt
1
2
3
4
5
data = np.genfromtxt("gradient-descent-data.csv", delimiter=",")
x_data = data[:, 0]
y_data = data[:, 1]
plt.scatter(x_data, y_data)
plt.show()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# loss function
def compute_error(b, k, x_data, y_data):
    """Return half the mean squared error of the line k * x + b.

    Args:
        b: Intercept of the line.
        k: Slope of the line.
        x_data: Iterable, sized collection of input values.
        y_data: Target values, indexable in step with ``x_data``.

    Returns:
        The sum of squared residuals divided by the number of samples,
        then divided by 2.

    Raises:
        ZeroDivisionError: If ``x_data`` is empty.
        IndexError: If ``y_data`` has fewer entries than ``x_data``.
    """
    total_error = 0
    for i, x in enumerate(x_data):
        total_error += (y_data[i] - (k * x + b)) ** 2
    return total_error / float(len(x_data)) / 2.0

# gradient descent
def gradient_descent_runner(x_data, y_data, b, k, lr, epochs):
    """Fit the line k * x + b with batch gradient descent.

    Args:
        x_data: Iterable, sized collection of input values.
        y_data: Target values, indexable in step with ``x_data``.
        b: Starting intercept.
        k: Starting slope.
        lr: Learning rate (step size applied to each averaged gradient).
        epochs: Number of full passes over the data.

    Returns:
        The tuple ``(b, k)`` after ``epochs`` updates.
    """
    # number of samples
    m = float(len(x_data))
    # loop over epochs
    for _ in range(epochs):
        b_grad = 0
        k_grad = 0
        for j, x in enumerate(x_data):
            # Prediction error for this sample, shared by both gradients.
            residual = (k * x + b) - y_data[j]
            b_grad += (1 / m) * residual
            k_grad += (1 / m) * x * residual
        # Update both parameters only after the whole pass, so each epoch's
        # gradients are computed from the same (b, k).
        b = b - (lr * b_grad)
        k = k - (lr * k_grad)
    return b, k
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# lr: learning rate
lr = 0.0001
# intercept
b = 0
# slope
k = 0
# number of iterations
epochs = 50

print("starting b={0}, k={1}, error = {2}".format(b, k, compute_error(b, k, x_data, y_data)))

# Announce the run before it starts, not after it has already finished.
print("Running...")
b, k = gradient_descent_runner(x_data, y_data, b, k, lr, epochs)

print("after {0} iterations b={1}, k={2}, error={3}".format(epochs, b, k, compute_error(b, k, x_data, y_data)))

# Blue dots: the raw data; red line: the fitted line k*x + b.
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, k*x_data +b, 'r')
plt.show()

The output result:

1
2
3
4

starting b=0, k=0, error = 2782.5539172416056
Running...
after 50 iterations b=0.030569950649287983, k=1.4788903781318357, error=56.32488184238028

PS:
gradient-descent-data.csv

Sklearn

1
2
3
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
1
2
3
4
5
6
data = np.genfromtxt("gradient-descent-data.csv", delimiter=",")
x_data = data[:, 0]
y_data = data[:, 1]
plt.scatter(x_data,y_data)
plt.show()
print(x_data.shape)

(100,)

1
2
3
4
x_data = data[:, 0, np.newaxis]
y_data = data[:, 1, np.newaxis]
model = LinearRegression()
model.fit(x_data,y_data)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
1
2
3
plt.plot(x_data,y_data,'b.')
plt.plot(x_data, model.predict(x_data), 'r')
plt.show()

d02-Cloud Functions

Functions-framework

Functions Framework helps set up an environment for testing Cloud Functions locally.
Here is an example.

installation

1
pip install functions-framework

code example:

Create a new project and install functions-framework as above, then create a new Python file named main.py:

1
2
3
4
5
6
def multiply(request):
    """HTTP function that multiplies the two numbers sent in a JSON body.

    Args:
        request: The incoming HTTP request object; its ``get_json()`` method
            supplies the parsed body, expected to be an object with the
            numeric fields ``num_1`` and ``num_2``.

    Returns:
        A ``(body, status)`` tuple: the product message with status 200, or
        an explanatory message with status 400 when the body is not a JSON
        object, a field is missing, or a field is not a number.
    """
    request_json = request.get_json()
    # A missing body or a non-object JSON value cannot carry the operands.
    if not isinstance(request_json, dict):
        return ("Request body must be a JSON object with 'num_1' and 'num_2'", 400)

    missing = [key for key in ("num_1", "num_2") if key not in request_json]
    if missing:
        return (f"Missing required field(s): {', '.join(missing)}", 400)

    num_1 = request_json["num_1"]
    num_2 = request_json["num_2"]
    # Reject non-numeric operands: e.g. "ab" * 3 would otherwise "succeed"
    # with string repetition. bool is excluded although it subclasses int.
    for value in (num_1, num_2):
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return ("'num_1' and 'num_2' must be numbers", 400)

    result = num_1 * num_2
    return (f"The multiplication result is {result}", 200)

Run the function locally:

1
functions-framework --port 8080 --target multiply --signature-type http --source main.py --debug

Test the function:
Use another terminal and enter the following curl command:

1
2
3
4
5
curl -X POST \
-H "Content-type:application/json" \
-d '{"num_1":20, "num_2": 30}' \
-w '\n' \
http://localhost:8080

Response in this new terminal:

The multiplication result is 600

Python-c06-update data

1
2
3
4
5
people = {
"first": ["Corey", 'Jane', 'John'],
"last": ["Schafer", 'Doe', 'Doe'],
"email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}
1
import pandas as pd
1
df = pd.DataFrame(people)
1
df.columns

Index(['first', 'last', 'email'], dtype='object')

1
2
# change all of the column names
df.columns=['first_name', 'last_name', 'email']
1
df

first_name last_name email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Doe JohnDoe@email.com
1
2
# change all column names to upper case
df.columns = [x.upper() for x in df.columns]
1
df

FIRST_NAME LAST_NAME EMAIL
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Doe JohnDoe@email.com
1
2
3
# replace _ by space in all of the column names and vice versa
df.columns = df.columns.str.replace('_', ' ')
df

FIRST NAME LAST NAME EMAIL
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Doe JohnDoe@email.com
1
2
3
df.columns = df.columns.str.replace(' ', '_')
df.columns = [x.lower() for x in df.columns]
df

first_name last_name email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Doe JohnDoe@email.com
1
2
3
# change only some columns: pass a dictionary for the key as the old name and the value as the new name
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)
df

first last email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Doe JohnDoe@email.com
1
# change the data values
1
2
3
# change all column values for a specific row
df.loc[2] = ['John', 'Smith', 'JohnSmith@email.com']
df

first last email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Smith JohnSmith@email.com
1
2
3
# change only some columns in the row 2
df.loc[2, ['last', 'email']] = ['Smith', 'JohnSmith@email.com']
df

first last email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Smith JohnSmith@email.com
1
2
3
# or we can also use at to change a specific column value
df.at[2, 'last'] = 'Doe'
df

first last email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Doe JohnSmith@email.com
1
2
3
4
# change value with filter: we have to use loc
filt = (df['email'] == 'JohnSmith@email.com')
df.loc[filt, 'last'] = 'Smith'
df

first last email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Smith JohnSmith@email.com
1
2
3
# change email to lower case
df['email'] = df['email'].str.lower()
df

first last email
0 Corey Schafer coreymschafer@gmail.com
1 Jane Doe janedoe@email.com
2 John Smith johnsmith@email.com
1
2
3
# apply
# return the email length of each row
df['email'].apply(len)

0 23
1 17
2 19
Name: email, dtype: int64

1
2
3
4
5
def update_email(email):
    """Return ``email`` converted to upper case."""
    return email.upper()

df['email'] = df['email'].apply(update_email)
df

first last email
0 Corey Schafer COREYMSCHAFER@GMAIL.COM
1 Jane Doe JANEDOE@EMAIL.COM
2 John Smith JOHNSMITH@EMAIL.COM
1
2
3
# use lambda function
df['email'] = df['email'].apply(lambda x: x.lower())
df

first last email
0 Corey Schafer coreymschafer@gmail.com
1 Jane Doe janedoe@email.com
2 John Smith johnsmith@email.com
1
2
3
# run apply on a dataframe
# this gives us the value count of each column
df.apply(len)

first 3
last 3
email 3
dtype: int64

1
2
# this gives us the column count of each row
df.apply(len, axis='columns')

0 3
1 3
2 3
dtype: int64

1
2
# get the min of each column
df.apply(pd.Series.min)

first Corey
last Doe
email coreymschafer@gmail.com
dtype: object

1
2
# applymap applies a function to every individual element of the DataFrame (renamed to DataFrame.map in pandas 2.1+)
df.applymap(len)

first last email
0 5 7 23
1 4 3 17
2 4 5 19
1
2
3
# map works on a Series
# replace Corey by Chris and Jane by Mary; values missing from the mapping (here John) become NaN
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

0 Chris
1 Mary
2 NaN
Name: first, dtype: object

1
2
# replace changes only the matching values (Corey and Jane); other values (here John) are left unchanged
df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})

0 Chris
1 Mary
2 John
Name: first, dtype: object

Python-c05-Filtering

1
2
3
4
5
people = {
"first": ["Corey", 'Jane', 'John'],
"last": ["Schafer", 'Doe', 'Doe'],
"email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}
1
import pandas as pd
1
df = pd.DataFrame(people)
1
df

first last email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Doe JohnDoe@email.com
1
filt = (df['last'] == 'Doe')
1
df[filt]

first last email
1 Jane Doe JaneDoe@email.com
2 John Doe JohnDoe@email.com
1
2
# filter df with filt and return just the column first and email
df.loc[filt, ['first', 'email']]

first email
1 Jane JaneDoe@email.com
2 John JohnDoe@email.com
1
2
# not in filter ~
df.loc[~filt, 'email']

0 CoreyMSchafer@gmail.com
Name: email, dtype: object

1
2
3
4
# isin filter
lasts = ['Doe', 'Mike', 'Bob']
filt = df['last'].isin(lasts)
df.loc[filt, 'email']

1 JaneDoe@email.com
2 JohnDoe@email.com
Name: email, dtype: object

1
df = pd.read_csv('data/survey_results_public.csv')
1
df

Respondent Professional ProgramHobby Country University EmploymentStatus FormalEducation MajorUndergrad HomeRemote CompanySize ... StackOverflowMakeMoney Gender HighestEducationParents Race SurveyLong QuestionsInteresting QuestionsConfusing InterestedAnswers Salary ExpectedSalary
0 1 Student Yes, both United States No Not employed, and not looking for work Secondary school NaN NaN NaN ... Strongly disagree Male High school White or of European descent Strongly disagree Strongly agree Disagree Strongly agree NaN NaN
1 2 Student Yes, both United Kingdom Yes, full-time Employed part-time Some college/university study without earning ... Computer science or software engineering More than half, but not all, the time 20 to 99 employees ... Strongly disagree Male A master's degree White or of European descent Somewhat agree Somewhat agree Disagree Strongly agree NaN 37500.0
2 3 Professional developer Yes, both United Kingdom No Employed full-time Bachelor's degree Computer science or software engineering Less than half the time, but at least one day ... 10,000 or more employees ... Disagree Male A professional degree White or of European descent Somewhat agree Agree Disagree Agree 113750.0 NaN
3 4 Professional non-developer who sometimes write... Yes, both United States No Employed full-time Doctoral degree A non-computer-focused engineering discipline Less than half the time, but at least one day ... 10,000 or more employees ... Disagree Male A doctoral degree White or of European descent Agree Agree Somewhat agree Strongly agree NaN NaN
4 5 Professional developer Yes, I program as a hobby Switzerland No Employed full-time Master's degree Computer science or software engineering Never 10 to 19 employees ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19097 19098 Professional developer Yes, I program as a hobby Canada No Employed full-time Bachelor's degree A business discipline A few days each month 10 to 19 employees ... Disagree Male Some college/university study, no bachelor's d... White or of European descent Somewhat agree Agree Disagree Agree NaN NaN
19098 19099 Student Yes, I program as a hobby India No Not employed, and not looking for work Secondary school NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19099 19100 Professional non-developer who sometimes write... Yes, I program as a hobby United Kingdom No Independent contractor, freelancer, or self-em... Bachelor's degree Computer science or software engineering Never NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19100 19101 Professional developer Yes, I program as a hobby United States No Employed full-time Some college/university study without earning ... A humanities discipline Less than half the time, but at least one day ... 100 to 499 employees ... Disagree Male Some college/university study, no bachelor's d... White or of European descent Somewhat agree Somewhat agree Disagree Agree 110000.0 NaN
19101 19102 Professional developer Yes, I program as a hobby France No Employed full-time Master's degree Computer science or software engineering All or almost all the time (I'm full-time remote) 100 to 499 employees ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

19102 rows × 154 columns

1
2
# na=False: rows where MajorUndergrad is missing (NaN) are treated as False instead of NaN
filt = df['MajorUndergrad'].str.contains('software', na=False)
1
df.loc[filt]

Respondent Professional ProgramHobby Country University EmploymentStatus FormalEducation MajorUndergrad HomeRemote CompanySize ... StackOverflowMakeMoney Gender HighestEducationParents Race SurveyLong QuestionsInteresting QuestionsConfusing InterestedAnswers Salary ExpectedSalary
1 2 Student Yes, both United Kingdom Yes, full-time Employed part-time Some college/university study without earning ... Computer science or software engineering More than half, but not all, the time 20 to 99 employees ... Strongly disagree Male A master's degree White or of European descent Somewhat agree Somewhat agree Disagree Strongly agree NaN 37500.0
2 3 Professional developer Yes, both United Kingdom No Employed full-time Bachelor's degree Computer science or software engineering Less than half the time, but at least one day ... 10,000 or more employees ... Disagree Male A professional degree White or of European descent Somewhat agree Agree Disagree Agree 113750.0 NaN
4 5 Professional developer Yes, I program as a hobby Switzerland No Employed full-time Master's degree Computer science or software engineering Never 10 to 19 employees ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 8 Professional developer Yes, both Poland No Employed full-time Master's degree Computer science or software engineering All or almost all the time (I'm full-time remote) Fewer than 10 employees ... Somewhat agree Male A master's degree White or of European descent Agree Somewhat agree Disagree Agree NaN NaN
8 9 Professional developer Yes, I program as a hobby Colombia Yes, part-time Employed full-time Bachelor's degree Computer science or software engineering Less than half the time, but at least one day ... 5,000 to 9,999 employees ... Strongly disagree Male A bachelor's degree Hispanic or Latino/Latina Somewhat agree Strongly agree Disagree Strongly agree NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19091 19092 Professional developer No Sweden No Employed full-time Master's degree Computer science or software engineering Never 100 to 499 employees ... Strongly disagree Male A master's degree White or of European descent Disagree Agree Somewhat agree Strongly agree NaN NaN
19095 19096 Professional developer Yes, both United States No Employed full-time Bachelor's degree Computer science or software engineering A few days each month 100 to 499 employees ... Disagree Male A bachelor's degree White or of European descent Somewhat agree Somewhat agree Disagree Somewhat agree NaN NaN
19096 19097 Professional developer No France No Employed full-time Master's degree Computer science or software engineering Never 20 to 99 employees ... Strongly disagree Male High school White or of European descent Strongly agree Somewhat agree Somewhat agree Somewhat agree NaN NaN
19099 19100 Professional non-developer who sometimes write... Yes, I program as a hobby United Kingdom No Independent contractor, freelancer, or self-em... Bachelor's degree Computer science or software engineering Never NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19101 19102 Professional developer Yes, I program as a hobby France No Employed full-time Master's degree Computer science or software engineering All or almost all the time (I'm full-time remote) 100 to 499 employees ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

8098 rows × 154 columns

Python-c04-indexes

1
import pandas as pd
1
2
df = pd.read_csv('data/survey_results_public.csv',  index_col='Respondent')
df_schema = pd.read_csv('data/survey_results_schema.csv', index_col='Column')
1
2
# inplace=True will apply the changes to the DataFrame
df_schema.sort_index(inplace=True)
1
df_schema

Question
Column
AnnoyingUI It annoys me when software has a poor UI
AssessJobCommute When you're assessing potential jobs to apply ...
AssessJobCompensation When you're assessing potential jobs to apply ...
AssessJobDept When you're assessing potential jobs to apply ...
AssessJobDiversity When you're assessing potential jobs to apply ...
... ...
WorkPayCare I don't really care what I work on, so long as...
WorkStart Suppose you could choose your own working hour...
YearsCodedJob For how many years have you coded as part of y...
YearsCodedJobPast For how many years did you code as part of you...
YearsProgram How long has it been since you first learned h...

154 rows × 1 columns

1
2
# use the 'Country' column as the index (inplace=True modifies df directly)
df.set_index('Country', inplace=True)
1
2
# give the list of index elements
df.index

Index(['United States', 'United Kingdom', 'United Kingdom', 'United States',
'Switzerland', 'New Zealand', 'United States', 'Poland', 'Colombia',
'France',
...
'United States', 'India', 'United Kingdom', 'United States', 'France',
'Canada', 'India', 'United Kingdom', 'United States', 'France'],
dtype='object', name='Country', length=19102)

1
2
# pass an index label to loc to select every row with that label
df.loc['United States']

University EmploymentStatus FormalEducation MajorUndergrad HomeRemote CompanySize CompanyType YearsProgram YearsCodedJob YearsCodedJobPast ... StackOverflowMakeMoney Gender HighestEducationParents Race SurveyLong QuestionsInteresting QuestionsConfusing InterestedAnswers Salary ExpectedSalary
Country
United States No Not employed, and not looking for work Secondary school NaN NaN NaN NaN 2 to 3 years NaN NaN ... Strongly disagree Male High school White or of European descent Strongly disagree Strongly agree Disagree Strongly agree NaN NaN
United States No Employed full-time Doctoral degree A non-computer-focused engineering discipline Less than half the time, but at least one day ... 10,000 or more employees Non-profit/non-governmental organization or pr... 14 to 15 years 9 to 10 years NaN ... Disagree Male A doctoral degree White or of European descent Agree Agree Somewhat agree Strongly agree NaN NaN
United States No Employed full-time Master's degree A non-computer-focused engineering discipline Less than half the time, but at least one day ... 20 to 99 employees Government agency or public school/university 9 to 10 years 8 to 9 years NaN ... Disagree Male A doctoral degree White or of European descent Disagree Agree Disagree Agree NaN NaN
United States No Employed full-time Bachelor's degree A social science All or almost all the time (I'm full-time remote) 100 to 499 employees Venture-funded startup 12 to 13 years 11 to 12 years NaN ... Strongly disagree Female Some college/university study, no bachelor's d... White or of European descent Strongly disagree Agree Strongly disagree Strongly agree NaN NaN
United States Yes, part-time Independent contractor, freelancer, or self-em... Primary/elementary school NaN All or almost all the time (I'm full-time remote) NaN NaN 11 to 12 years 1 to 2 years NaN ... Disagree Male A doctoral degree White or of European descent Disagree Somewhat agree Strongly disagree Agree NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
United States No Employed full-time Bachelor's degree A natural science All or almost all the time (I'm full-time remote) Fewer than 10 employees Privately-held limited company, not in startup... 6 to 7 years 3 to 4 years NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
United States No Employed part-time Some college/university study without earning ... Computer science or software engineering Less than half the time, but at least one day ... 10 to 19 employees Sole proprietorship or partnership, not in sta... 2 to 3 years 1 to 2 years NaN ... NaN Male Some college/university study, no bachelor's d... White or of European descent Disagree Agree Disagree Strongly agree NaN NaN
United States No Employed full-time Bachelor's degree Management information systems Never 10,000 or more employees Publicly-traded corporation 20 or more years 20 or more years NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
United States No Employed full-time Bachelor's degree Computer science or software engineering A few days each month 100 to 499 employees Venture-funded startup 20 or more years 19 to 20 years NaN ... Disagree Male A bachelor's degree White or of European descent Somewhat agree Somewhat agree Disagree Somewhat agree NaN NaN
United States No Employed full-time Some college/university study without earning ... A humanities discipline Less than half the time, but at least one day ... 100 to 499 employees Sole proprietorship or partnership, not in sta... 20 or more years 20 or more years NaN ... Disagree Male Some college/university study, no bachelor's d... White or of European descent Somewhat agree Somewhat agree Disagree Agree 110000.0 NaN

4364 rows × 150 columns

1
2
# reset the index back to the default integer index ('Country' becomes a regular column again)
df.reset_index(inplace=True)
1
df

Country University EmploymentStatus FormalEducation MajorUndergrad HomeRemote CompanySize CompanyType YearsProgram YearsCodedJob ... StackOverflowMakeMoney Gender HighestEducationParents Race SurveyLong QuestionsInteresting QuestionsConfusing InterestedAnswers Salary ExpectedSalary
0 United States No Not employed, and not looking for work Secondary school NaN NaN NaN NaN 2 to 3 years NaN ... Strongly disagree Male High school White or of European descent Strongly disagree Strongly agree Disagree Strongly agree NaN NaN
1 United Kingdom Yes, full-time Employed part-time Some college/university study without earning ... Computer science or software engineering More than half, but not all, the time 20 to 99 employees Privately-held limited company, not in startup... 9 to 10 years NaN ... Strongly disagree Male A master's degree White or of European descent Somewhat agree Somewhat agree Disagree Strongly agree NaN 37500.0
2 United Kingdom No Employed full-time Bachelor's degree Computer science or software engineering Less than half the time, but at least one day ... 10,000 or more employees Publicly-traded corporation 20 or more years 20 or more years ... Disagree Male A professional degree White or of European descent Somewhat agree Agree Disagree Agree 113750.0 NaN
3 United States No Employed full-time Doctoral degree A non-computer-focused engineering discipline Less than half the time, but at least one day ... 10,000 or more employees Non-profit/non-governmental organization or pr... 14 to 15 years 9 to 10 years ... Disagree Male A doctoral degree White or of European descent Agree Agree Somewhat agree Strongly agree NaN NaN
4 Switzerland No Employed full-time Master's degree Computer science or software engineering Never 10 to 19 employees Privately-held limited company, not in startup... 20 or more years 10 to 11 years ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19097 Canada No Employed full-time Bachelor's degree A business discipline A few days each month 10 to 19 employees Privately-held limited company, not in startup... 1 to 2 years 1 to 2 years ... Disagree Male Some college/university study, no bachelor's d... White or of European descent Somewhat agree Agree Disagree Agree NaN NaN
19098 India No Not employed, and not looking for work Secondary school NaN NaN NaN NaN 1 to 2 years NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19099 United Kingdom No Independent contractor, freelancer, or self-em... Bachelor's degree Computer science or software engineering Never NaN NaN 14 to 15 years 14 to 15 years ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19100 United States No Employed full-time Some college/university study without earning ... A humanities discipline Less than half the time, but at least one day ... 100 to 499 employees Sole proprietorship or partnership, not in sta... 20 or more years 20 or more years ... Disagree Male Some college/university study, no bachelor's d... White or of European descent Somewhat agree Somewhat agree Disagree Agree 110000.0 NaN
19101 France No Employed full-time Master's degree Computer science or software engineering All or almost all the time (I'm full-time remote) 100 to 499 employees Sole proprietorship or partnership, not in sta... 14 to 15 years 9 to 10 years ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

19102 rows × 151 columns

1
df.sort_index(ascending=False)

Country University EmploymentStatus FormalEducation MajorUndergrad HomeRemote CompanySize CompanyType YearsProgram YearsCodedJob ... StackOverflowMakeMoney Gender HighestEducationParents Race SurveyLong QuestionsInteresting QuestionsConfusing InterestedAnswers Salary ExpectedSalary
19101 France No Employed full-time Master's degree Computer science or software engineering All or almost all the time (I'm full-time remote) 100 to 499 employees Sole proprietorship or partnership, not in sta... 14 to 15 years 9 to 10 years ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19100 United States No Employed full-time Some college/university study without earning ... A humanities discipline Less than half the time, but at least one day ... 100 to 499 employees Sole proprietorship or partnership, not in sta... 20 or more years 20 or more years ... Disagree Male Some college/university study, no bachelor's d... White or of European descent Somewhat agree Somewhat agree Disagree Agree 110000.0 NaN
19099 United Kingdom No Independent contractor, freelancer, or self-em... Bachelor's degree Computer science or software engineering Never NaN NaN 14 to 15 years 14 to 15 years ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19098 India No Not employed, and not looking for work Secondary school NaN NaN NaN NaN 1 to 2 years NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19097 Canada No Employed full-time Bachelor's degree A business discipline A few days each month 10 to 19 employees Privately-held limited company, not in startup... 1 to 2 years 1 to 2 years ... Disagree Male Some college/university study, no bachelor's d... White or of European descent Somewhat agree Agree Disagree Agree NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4 Switzerland No Employed full-time Master's degree Computer science or software engineering Never 10 to 19 employees Privately-held limited company, not in startup... 20 or more years 10 to 11 years ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 United States No Employed full-time Doctoral degree A non-computer-focused engineering discipline Less than half the time, but at least one day ... 10,000 or more employees Non-profit/non-governmental organization or pr... 14 to 15 years 9 to 10 years ... Disagree Male A doctoral degree White or of European descent Agree Agree Somewhat agree Strongly agree NaN NaN
2 United Kingdom No Employed full-time Bachelor's degree Computer science or software engineering Less than half the time, but at least one day ... 10,000 or more employees Publicly-traded corporation 20 or more years 20 or more years ... Disagree Male A professional degree White or of European descent Somewhat agree Agree Disagree Agree 113750.0 NaN
1 United Kingdom Yes, full-time Employed part-time Some college/university study without earning ... Computer science or software engineering More than half, but not all, the time 20 to 99 employees Privately-held limited company, not in startup... 9 to 10 years NaN ... Strongly disagree Male A master's degree White or of European descent Somewhat agree Somewhat agree Disagree Strongly agree NaN 37500.0
0 United States No Not employed, and not looking for work Secondary school NaN NaN NaN NaN 2 to 3 years NaN ... Strongly disagree Male High school White or of European descent Strongly disagree Strongly agree Disagree Strongly agree NaN NaN

19102 rows × 151 columns

Python-c03-DataFrame and Series

1
import pandas as pd
1
2
3
4
5
person = {
"first": "Corey",
"last": "Schafer",
"email": "CoreyMSchafer@gmail.com"
}
1
2
3
4
5
people = {
"first": ["Corey"],
"last": ["Schafer"],
"email": ["CoreyMSchafer@gmail.com"]
}
1
2
3
4
5
people = {
"first": ["Corey", 'Jane', 'John'],
"last": ["Schafer", 'Doe', 'Doe'],
"email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}
1
people['email']

['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

1
df = pd.DataFrame(people)
1
df

first last email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
2 John Doe JohnDoe@email.com
1
df['email']

0 CoreyMSchafer@gmail.com
1 JaneDoe@email.com
2 JohnDoe@email.com
Name: email, dtype: object

1
2
#display multiple columns
df[['last', 'email']]

last email
0 Schafer CoreyMSchafer@gmail.com
1 Doe JaneDoe@email.com
2 Doe JohnDoe@email.com
1
2
3
# iloc (integer location): pass an integer position to select a row
# here, position 0 returns the first row
df.iloc[0]

first Corey
last Schafer
email CoreyMSchafer@gmail.com
Name: 0, dtype: object

1
df.iloc[[0, 1]]

first last email
0 Corey Schafer CoreyMSchafer@gmail.com
1 Jane Doe JaneDoe@email.com
1
2
# get the 'email' and 'last' columns of the first and second rows
df.loc[[0, 1], ['email', 'last']]

email last
0 CoreyMSchafer@gmail.com Schafer
1 JaneDoe@email.com Doe
1
df.iloc[[0, 1], ['email', 'last']]
---------------------------------------------------------------------------

IndexError Traceback (most recent call last)

Cell In [15], line 1
----> 1 df.iloc[[0, 1], ['email', 'last']]

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1068, in _LocationIndexer.__getitem__(self, key)
1066 if self._is_scalar_access(key):
1067 return self.obj._get_value(*key, takeable=self._takeable)
-> 1068 return self._getitem_tuple(key)
1069 else:
1070 # we by definition only have the 0th axis
1071 axis = self.axis or 0

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1564, in _iLocIndexer._getitem_tuple(self, tup)
1562 def _getitem_tuple(self, tup: tuple):
-> 1564 tup = self._validate_tuple_indexer(tup)
1565 with suppress(IndexingError):
1566 return self._getitem_lowerdim(tup)

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:874, in _LocationIndexer._validate_tuple_indexer(self, key)
872 for i, k in enumerate(key):
873 try:
--> 874 self._validate_key(k, i)
875 except ValueError as err:
876 raise ValueError(
877 "Location based indexing can only have "
878 f"[{self._valid_types}] types"
879 ) from err

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1478, in _iLocIndexer._validate_key(self, key, axis)
1476 # check that the key has a numeric dtype
1477 if not is_numeric_dtype(arr.dtype):
-> 1478 raise IndexError(f".iloc requires numeric indexers, got {arr}")
1480 # check that the key does not exceed the maximum size of the index
1481 if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):

IndexError: .iloc requires numeric indexers, got ['email' 'last']

1
df.iloc[[0, 1], [1,2]]

last email
0 Schafer CoreyMSchafer@gmail.com
1 Doe JaneDoe@email.com