2023-11-03

Python-c07-Filtering-and-Ordering

1	import pandas as pd

1	df = pd.read_csv("world_population.csv")

1	df[df["Rank"] < 10]

	Rank	CCA3	Country	Capital	Continent	2022 Population	2020 Population	2015 Population	2010 Population	2000 Population	1990 Population	1980 Population	1970 Population	Area (km²)	Density (per km²)	Growth Rate	World Population Percentage
16	8	BGD	Bangladesh	Dhaka	Asia	1.711864e+08	1.674210e+08	1.578300e+08	1.483911e+08	1.291933e+08	1.071477e+08	83929765.0	67541860.0	147570.0	1160.0350	1.0108	2.15
27	7	BRA	Brazil	Brasilia	South America	2.153135e+08	2.131963e+08	2.051882e+08	1.963535e+08	1.758737e+08	1.507064e+08	122288383.0	96369875.0	8515767.0	25.2841	1.0046	2.70
41	1	CHN	China	Beijing	Asia	1.425887e+09	1.424930e+09	1.393715e+09	1.348191e+09	1.264099e+09	1.153704e+09	982372466.0	822534450.0	9706961.0	146.8933	1.0000	17.88
92	2	IND	India	New Delhi	Asia	1.417173e+09	1.396387e+09	1.322867e+09	1.240614e+09	1.059634e+09	NaN	NaN	557501301.0	3287590.0	431.0675	1.0068	17.77
93	4	IDN	Indonesia	Jakarta	Asia	2.755013e+08	2.718580e+08	2.590920e+08	2.440162e+08	2.140724e+08	1.821599e+08	148177096.0	115228394.0	1904569.0	144.6529	1.0064	3.45
149	6	NGA	Nigeria	Abuja	Africa	2.185412e+08	2.083274e+08	1.839958e+08	1.609529e+08	1.228520e+08	9.521426e+07	72951439.0	55569264.0	923768.0	236.5759	1.0241	2.74
156	5	PAK	Pakistan	Islamabad	Asia	2.358249e+08	2.271967e+08	2.109693e+08	1.944545e+08	1.543699e+08	1.154141e+08	80624057.0	59290872.0	881912.0	267.4018	1.0191	2.96
171	9	RUS	Russia	Moscow	Europe	1.447133e+08	1.456173e+08	1.446684e+08	1.432426e+08	1.468448e+08	1.480057e+08	138257420.0	130093010.0	17098242.0	8.4636	0.9973	1.81
221	3	USA	United States	Washington, D.C.	North America	3.382899e+08	3.359420e+08	3.246078e+08	3.111828e+08	2.823986e+08	2.480837e+08	223140018.0	200328340.0	9372610.0	36.0935	1.0038	4.24

# keep only the rows whose Country is in the given list
specific_countries = ['Bangladesh', 'Brazil']
df[df['Country'].isin(specific_countries)]

	Rank	CCA3	Country	Capital	Continent	2022 Population	2020 Population	2015 Population	2010 Population	2000 Population	1990 Population	1980 Population	1970 Population	Area (km²)	Density (per km²)	Growth Rate	World Population Percentage
16	8	BGD	Bangladesh	Dhaka	Asia	171186372.0	167420951.0	157830000.0	148391139.0	129193327.0	107147651.0	83929765.0	67541860.0	147570.0	1160.0350	1.0108	2.15
27	7	BRA	Brazil	Brasilia	South America	215313498.0	213196304.0	205188205.0	196353492.0	175873720.0	150706446.0	122288383.0	96369875.0	8515767.0	25.2841	1.0046	2.70

# filter the countries whose name contains "United"
df[df['Country'].str.contains('United')]

	Rank	CCA3	Country	Capital	Continent	2022 Population	2020 Population	2015 Population	2010 Population	2000 Population	1990 Population	1980 Population	1970 Population	Area (km²)	Density (per km²)	Growth Rate	World Population Percentage
219	97	ARE	United Arab Emirates	Abu Dhabi	Asia	9441129.0	9287289.0	8916899.0	8481771.0	3275333.0	1900151.0	1014048.0	298084.0	83600.0	112.9322	1.0081	0.12
220	21	GBR	United Kingdom	London	Europe	67508936.0	67059474.0	65224364.0	62760039.0	58850043.0	57210442.0	56326328.0	55650166.0	242900.0	277.9289	1.0034	0.85
221	3	USA	United States	Washington, D.C.	North America	338289857.0	335942003.0	324607776.0	311182845.0	282398554.0	248083732.0	223140018.0	200328340.0	9372610.0	36.0935	1.0038	4.24
222	200	VIR	United States Virgin Islands	Charlotte Amalie	North America	99465.0	100442.0	102803.0	106142.0	108185.0	100685.0	96640.0	63446.0	347.0	286.6427	0.9937	0.00

# set index to country
df2 = df.set_index('Country')
# show only the columns Continent and CCA3
df2.filter(items = ['Continent', 'CCA3'])

	Continent	CCA3
Country
Afghanistan	Asia	AFG
Albania	Europe	ALB
Algeria	Africa	DZA
American Samoa	Oceania	ASM
Andorra	Europe	AND
...	...	...
Wallis and Futuna	Oceania	WLF
Western Sahara	Africa	ESH
Yemen	Asia	YEM
Zambia	Africa	ZMB
Zimbabwe	Africa	ZWE

234 rows × 2 columns

1
2
3

# axis=0 filters on the index (row labels) and axis=1 filters on the column labels
# The following returns an empty result because 'Continent' and 'CCA3' are not row labels (the index is Country)
df2.filter(items=['Continent', 'CCA3'], axis=0)

	Rank	CCA3	Capital	Continent	2022 Population	2020 Population	2015 Population	2010 Population	2000 Population	1990 Population	1980 Population	1970 Population	Area (km²)	Density (per km²)	Growth Rate	World Population Percentage

# but axis=1 works because 'Continent' and 'CCA3' exist among the column labels
df2.filter(items=['Continent', 'CCA3'], axis=1)

	Continent	CCA3
Country
Afghanistan	Asia	AFG
Albania	Europe	ALB
Algeria	Africa	DZA
American Samoa	Oceania	ASM
Andorra	Europe	AND
...	...	...
Wallis and Futuna	Oceania	WLF
Western Sahara	Africa	ESH
Yemen	Asia	YEM
Zambia	Africa	ZMB
Zimbabwe	Africa	ZWE

234 rows × 2 columns

# This returns all the rows whose index label (the country name) contains "United"
df2.filter(like='United', axis=0)

	Rank	CCA3	Capital	Continent	2022 Population	2020 Population	2015 Population	2010 Population	2000 Population	1990 Population	1980 Population	1970 Population	Area (km²)	Density (per km²)	Growth Rate	World Population Percentage
Country
United Arab Emirates	97	ARE	Abu Dhabi	Asia	9441129.0	9287289.0	8916899.0	8481771.0	3275333.0	1900151.0	1014048.0	298084.0	83600.0	112.9322	1.0081	0.12
United Kingdom	21	GBR	London	Europe	67508936.0	67059474.0	65224364.0	62760039.0	58850043.0	57210442.0	56326328.0	55650166.0	242900.0	277.9289	1.0034	0.85
United States	3	USA	Washington, D.C.	North America	338289857.0	335942003.0	324607776.0	311182845.0	282398554.0	248083732.0	223140018.0	200328340.0	9372610.0	36.0935	1.0038	4.24
United States Virgin Islands	200	VIR	Charlotte Amalie	North America	99465.0	100442.0	102803.0	106142.0	108185.0	100685.0	96640.0	63446.0	347.0	286.6427	0.9937	0.00

1	df2.loc['United States']

Rank                                          3
CCA3                                        USA
Capital                        Washington, D.C.
Continent                         North America
2022 Population                     338289857.0
2020 Population                     335942003.0
2015 Population                     324607776.0
2010 Population                     311182845.0
2000 Population                     282398554.0
1990 Population                     248083732.0
1980 Population                     223140018.0
1970 Population                     200328340.0
Area (km²)                            9372610.0
Density (per km²)                       36.0935
Growth Rate                              1.0038
World Population Percentage                4.24
Name: United States, dtype: object

1	df2.iloc[3]

Rank                                 213
CCA3                                 ASM
Capital                        Pago Pago
Continent                        Oceania
2022 Population                  44273.0
2020 Population                  46189.0
2015 Population                  51368.0
2010 Population                  54849.0
2000 Population                  58230.0
1990 Population                  47818.0
1980 Population                  32886.0
1970 Population                  27075.0
Area (km²)                         199.0
Density (per km²)               222.4774
Growth Rate                       0.9831
World Population Percentage          0.0
Name: American Samoa, dtype: object

# sort by Rank, then by Country, both in ascending order
df[df['Rank'] < 10].sort_values(by=['Rank', 'Country'], ascending=True)

	Rank	CCA3	Country	Capital	Continent	2022 Population	2020 Population	2015 Population	2010 Population	2000 Population	1990 Population	1980 Population	1970 Population	Area (km²)	Density (per km²)	Growth Rate	World Population Percentage
41	1	CHN	China	Beijing	Asia	1.425887e+09	1.424930e+09	1.393715e+09	1.348191e+09	1.264099e+09	1.153704e+09	982372466.0	822534450.0	9706961.0	146.8933	1.0000	17.88
92	2	IND	India	New Delhi	Asia	1.417173e+09	1.396387e+09	1.322867e+09	1.240614e+09	1.059634e+09	NaN	NaN	557501301.0	3287590.0	431.0675	1.0068	17.77
221	3	USA	United States	Washington, D.C.	North America	3.382899e+08	3.359420e+08	3.246078e+08	3.111828e+08	2.823986e+08	2.480837e+08	223140018.0	200328340.0	9372610.0	36.0935	1.0038	4.24
93	4	IDN	Indonesia	Jakarta	Asia	2.755013e+08	2.718580e+08	2.590920e+08	2.440162e+08	2.140724e+08	1.821599e+08	148177096.0	115228394.0	1904569.0	144.6529	1.0064	3.45
156	5	PAK	Pakistan	Islamabad	Asia	2.358249e+08	2.271967e+08	2.109693e+08	1.944545e+08	1.543699e+08	1.154141e+08	80624057.0	59290872.0	881912.0	267.4018	1.0191	2.96
149	6	NGA	Nigeria	Abuja	Africa	2.185412e+08	2.083274e+08	1.839958e+08	1.609529e+08	1.228520e+08	9.521426e+07	72951439.0	55569264.0	923768.0	236.5759	1.0241	2.74
27	7	BRA	Brazil	Brasilia	South America	2.153135e+08	2.131963e+08	2.051882e+08	1.963535e+08	1.758737e+08	1.507064e+08	122288383.0	96369875.0	8515767.0	25.2841	1.0046	2.70
16	8	BGD	Bangladesh	Dhaka	Asia	1.711864e+08	1.674210e+08	1.578300e+08	1.483911e+08	1.291933e+08	1.071477e+08	83929765.0	67541860.0	147570.0	1160.0350	1.0108	2.15
171	9	RUS	Russia	Moscow	Europe	1.447133e+08	1.456173e+08	1.446684e+08	1.432426e+08	1.468448e+08	1.480057e+08	138257420.0	130093010.0	17098242.0	8.4636	0.9973	1.81

# sort by Rank in ascending order, then by Country in descending order
df[df['Rank'] < 10].sort_values(by=['Rank', 'Country'], ascending=[True, False])

	Rank	CCA3	Country	Capital	Continent	2022 Population	2020 Population	2015 Population	2010 Population	2000 Population	1990 Population	1980 Population	1970 Population	Area (km²)	Density (per km²)	Growth Rate	World Population Percentage
41	1	CHN	China	Beijing	Asia	1.425887e+09	1.424930e+09	1.393715e+09	1.348191e+09	1.264099e+09	1.153704e+09	982372466.0	822534450.0	9706961.0	146.8933	1.0000	17.88
92	2	IND	India	New Delhi	Asia	1.417173e+09	1.396387e+09	1.322867e+09	1.240614e+09	1.059634e+09	NaN	NaN	557501301.0	3287590.0	431.0675	1.0068	17.77
221	3	USA	United States	Washington, D.C.	North America	3.382899e+08	3.359420e+08	3.246078e+08	3.111828e+08	2.823986e+08	2.480837e+08	223140018.0	200328340.0	9372610.0	36.0935	1.0038	4.24
93	4	IDN	Indonesia	Jakarta	Asia	2.755013e+08	2.718580e+08	2.590920e+08	2.440162e+08	2.140724e+08	1.821599e+08	148177096.0	115228394.0	1904569.0	144.6529	1.0064	3.45
156	5	PAK	Pakistan	Islamabad	Asia	2.358249e+08	2.271967e+08	2.109693e+08	1.944545e+08	1.543699e+08	1.154141e+08	80624057.0	59290872.0	881912.0	267.4018	1.0191	2.96
149	6	NGA	Nigeria	Abuja	Africa	2.185412e+08	2.083274e+08	1.839958e+08	1.609529e+08	1.228520e+08	9.521426e+07	72951439.0	55569264.0	923768.0	236.5759	1.0241	2.74
27	7	BRA	Brazil	Brasilia	South America	2.153135e+08	2.131963e+08	2.051882e+08	1.963535e+08	1.758737e+08	1.507064e+08	122288383.0	96369875.0	8515767.0	25.2841	1.0046	2.70
16	8	BGD	Bangladesh	Dhaka	Asia	1.711864e+08	1.674210e+08	1.578300e+08	1.483911e+08	1.291933e+08	1.071477e+08	83929765.0	67541860.0	147570.0	1160.0350	1.0108	2.15
171	9	RUS	Russia	Moscow	Europe	1.447133e+08	1.456173e+08	1.446684e+08	1.432426e+08	1.468448e+08	1.480057e+08	138257420.0	130093010.0	17098242.0	8.4636	0.9973	1.81

PS:
world_population.csv

2023-11-02

python►d-selenium

Python-d01-selenium

Here is an example of using Selenium in Python on LinkedIn.
This code connects to LinkedIn and sends the message “testing” to the first friend.
You have to replace USERNAME_TO_CHANGE and PASSWORD_TO_CHANGE with your own credentials to authenticate.

This code works with Python 3.8 and Selenium 4.14.0.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

import time

# Start Chrome through a local chromedriver binary. The raw string keeps the
# Windows backslashes literal instead of relying on unrecognised escapes.
driver = webdriver.Chrome(service=Service(executable_path=r'C:\dev\softwares\chromedriver\chromedriver.exe'))
driver.get('https://www.linkedin.com/')

# Fixed pauses give the page time to render before elements are looked up.
time.sleep(2)

# USERNAME_TO_CHANGE / PASSWORD_TO_CHANGE are placeholders: define them with
# your own credentials before running this script.
username = driver.find_element(By.XPATH, "//input[@name='session_key']")
password = driver.find_element(By.XPATH, "//input[@name='session_password']")
username.send_keys(USERNAME_TO_CHANGE)
password.send_keys(PASSWORD_TO_CHANGE)

time.sleep(2)
# click() returns None, so there is nothing useful to keep in a variable.
driver.find_element(By.XPATH, "//button[@type='submit']").click()

# People search restricted to the "F" network filter.
driver.get("https://www.linkedin.com/search/results/people/?network=%5B%22F%22%5D&origin=FACETED_SEARCH")
time.sleep(2)

# Collect every button whose visible text is exactly "Message".
all_buttons = driver.find_elements(By.TAG_NAME, 'button')
message_buttons = [btn for btn in all_buttons if btn.text == "Message"]

# Stop with a clear message instead of an IndexError when nothing matched
# (for example when the login failed or the page has not finished loading).
if not message_buttons:
    driver.quit()
    raise SystemExit("No 'Message' button found on the search results page")

message_buttons[0].click()

time.sleep(2)

# Click the message form container so the editor receives focus.
main_div = driver.find_element(By.XPATH, "//div[starts-with(@class, 'msg-form__msg-content-container')]")
main_div.click()

time.sleep(2)

# NOTE(review): the editable paragraph is picked by position (5th from the
# end); this depends on the current page layout - confirm before reuse.
paragraphs = driver.find_elements(By.TAG_NAME, "p")
paragraphs[-5].send_keys("testing")

2023-10-13

ml►a-algorithm

ml-a03-polynomial

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

data = np.genfromtxt("job.csv", delimiter=",")
x_data = data[1:,1]
y_data = data[1:,2]
plt.scatter(x_data,y_data)
plt.show()

# In this example we use LinearRegression; we can see that the straight line does not fit the points well

x_data = data[1:,1,np.newaxis]
y_data = data[1:,2,np.newaxis]
model = LinearRegression()
model.fit(x_data, y_data)

plt.plot(x_data,y_data, 'b.')
plt.plot(x_data,model.predict(x_data), 'r')
plt.show()

poly_reg = PolynomialFeatures(degree=5)
# PolynomialFeatures expands the data with powers of x: degree=1 => x^0 and x^1, degree=2 => x^0, x^1 and x^2, ... so degree=5 yields the six columns x^0 .. x^5
x_poly = poly_reg.fit_transform(x_data)
lin_reg = LinearRegression()
lin_reg.fit(x_poly, y_data)

x_poly

array([[1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00],
       [1.0000e+00, 2.0000e+00, 4.0000e+00, 8.0000e+00, 1.6000e+01,
        3.2000e+01],
       [1.0000e+00, 3.0000e+00, 9.0000e+00, 2.7000e+01, 8.1000e+01,
        2.4300e+02],
       [1.0000e+00, 4.0000e+00, 1.6000e+01, 6.4000e+01, 2.5600e+02,
        1.0240e+03],
       [1.0000e+00, 5.0000e+00, 2.5000e+01, 1.2500e+02, 6.2500e+02,
        3.1250e+03],
       [1.0000e+00, 6.0000e+00, 3.6000e+01, 2.1600e+02, 1.2960e+03,
        7.7760e+03],
       [1.0000e+00, 7.0000e+00, 4.9000e+01, 3.4300e+02, 2.4010e+03,
        1.6807e+04],
       [1.0000e+00, 8.0000e+00, 6.4000e+01, 5.1200e+02, 4.0960e+03,
        3.2768e+04],
       [1.0000e+00, 9.0000e+00, 8.1000e+01, 7.2900e+02, 6.5610e+03,
        5.9049e+04],
       [1.0000e+00, 1.0000e+01, 1.0000e+02, 1.0000e+03, 1.0000e+04,
        1.0000e+05]])

1
2
3

plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, lin_reg.predict(x_poly), 'r')
plt.show()

PS:
job.csv

2023-10-13

ml►a-algorithm

ml-a02-multiple_linear_regression

Multiple Linear Regression

import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# load the delivery data set (two feature columns followed by the label column)
data = genfromtxt(r"Delivery.csv", delimiter=",")
print(data)

[[100.    4.    9.3]
 [ 50.    3.    4.8]
 [100.    4.    8.9]
 [100.    2.    6.5]
 [ 50.    2.    4.2]
 [ 80.    2.    6.2]
 [ 75.    3.    7.4]
 [ 65.    4.    6. ]
 [ 90.    3.    7.6]
 [ 90.    2.    6.1]]

# all data until the last column
x_data = data[:, :-1]
# label: the last column
y_data = data[:, -1]
print(x_data)
print(y_data)

[[100.   4.]
 [ 50.   3.]
 [100.   4.]
 [100.   2.]
 [ 50.   2.]
 [ 80.   2.]
 [ 75.   3.]
 [ 65.   4.]
 [ 90.   3.]
 [ 90.   2.]]
[9.3 4.8 8.9 6.5 4.2 6.2 7.4 6.  7.6 6.1]

def compute_error(theta0, theta1, theta2, x_data, y_data):
    """Return the mean squared error of the plane theta0 + theta1*x0 + theta2*x1.

    x_data is read as x_data[i, 0] / x_data[i, 1] and y_data as y_data[i];
    the summed squared residuals are divided by len(x_data).
    """
    sample_count = len(x_data)
    squared_sum = 0
    for row in range(sample_count):
        prediction = theta1 * x_data[row, 0] + theta2 * x_data[row, 1] + theta0
        squared_sum += (y_data[row] - prediction) ** 2
    return squared_sum / float(sample_count)

def gradient_descent_runner(x_data, y_data, theta0, theta1, theta2, lr, epochs):
    """Run batch gradient descent for the model theta0 + theta1*x0 + theta2*x1.

    Each of the `epochs` passes accumulates the averaged gradient over all
    samples and then moves every theta one step of size `lr` against it.
    Returns the updated (theta0, theta1, theta2).
    """
    # number of samples, kept as a float for the 1/m averaging below
    m = float(len(x_data))
    for _ in range(epochs):
        grad0 = 0
        grad1 = 0
        grad2 = 0
        for row in range(len(x_data)):
            feat0 = x_data[row, 0]
            feat1 = x_data[row, 1]
            # prediction minus target for the current sample
            residual = (theta1 * feat0 + theta2 * feat1 + theta0) - y_data[row]
            grad0 += (1 / m) * residual
            grad1 += (1 / m) * feat0 * residual
            grad2 += (1 / m) * feat1 * residual
        # all three parameters are updated from gradients of the same pass
        theta0 = theta0 - lr * grad0
        theta1 = theta1 - lr * grad1
        theta2 = theta2 - lr * grad2
    return theta0, theta1, theta2

lr = 0.0001
theta0 = 0
theta1 = 0
theta2 = 0
epochs = 1000
print("starting theta0={0}, theta1={1}, theta2={2}, error={3}"
      .format( theta0,theta1,theta2,compute_error(theta0, theta1, theta2, x_data, y_data)))
print("Running...")
theta0, theta1, theta2 = gradient_descent_runner(x_data, y_data, theta0, theta1, theta2, lr, epochs)
print("After {0} iterations theta0={1}, theta1={2}, theta2={3}, error={4}"
      .format( epochs, theta0,theta1,theta2,compute_error(theta0, theta1, theta2, x_data, y_data)))

starting theta0=0, theta1=0, theta2=0, error=47.279999999999994
Running...
After 1000 iterations theta0=0.006971416196678632, theta1=0.08021042690771771, theta2=0.07611036240566814, error=0.7731271432218118

ax = plt.figure().add_subplot(111, projection="3d")
# c='r' => red color, marker='o' => draw each point as a circle, s=100 => marker size 100
ax.scatter(x_data[:, 0], x_data[:, 1], y_data, c='r', marker="o", s=100)

# build a grid over the two features to draw the fitted plane
x0 = x_data[:, 0]
x1 = x_data[:, 1]
x0, x1 = np.meshgrid(x0, x1)
# the second feature must be weighted by theta2 (not theta1) to plot the learned model
z = theta0 + theta1 * x0 + theta2 * x1
ax.plot_surface(x0, x1, z)

ax.set_xlabel('Miles')
ax.set_ylabel('Num of Deliveries')
ax.set_zlabel('Time')
plt.show()

PS:
Delivery.csv

Sklearn

import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
from sklearn import linear_model
from mpl_toolkits.mplot3d import Axes3D

# load the delivery data set (two feature columns followed by the label column)
data = genfromtxt(r"Delivery.csv", delimiter=",")
print(data)

[[100.    4.    9.3]
 [ 50.    3.    4.8]
 [100.    4.    8.9]
 [100.    2.    6.5]
 [ 50.    2.    4.2]
 [ 80.    2.    6.2]
 [ 75.    3.    7.4]
 [ 65.    4.    6. ]
 [ 90.    3.    7.6]
 [ 90.    2.    6.1]]

# all data until the last column
x_data = data[:, :-1]
# label: the last column
y_data = data[:, -1]
print(x_data)
print(y_data)

[[100.   4.]
 [ 50.   3.]
 [100.   4.]
 [100.   2.]
 [ 50.   2.]
 [ 80.   2.]
 [ 75.   3.]
 [ 65.   4.]
 [ 90.   3.]
 [ 90.   2.]]
[9.3 4.8 8.9 6.5 4.2 6.2 7.4 6.  7.6 6.1]

# fit an ordinary least squares model on the two features
model = linear_model.LinearRegression()
model.fit(x_data, y_data)

LinearRegression()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

print("coefficients:", model.coef_)
print("intercept:", model.intercept_)

x_test = [[102,4]]
predict = model.predict(x_test)
print("predict:", predict)

coefficients: [0.0611346  0.92342537]
intercept: -0.868701466781709
predict: [9.06072908]

ax = plt.figure().add_subplot(111, projection="3d")
ax.scatter(x_data[:,0], x_data[:,1], y_data, c='r', marker='o', s=100)
x0 = x_data[:,0]
x1 = x_data[:,1]
x0,x1 = np.meshgrid(x0,x1)

z = model.intercept_ + x0*model.coef_[0] + x1*model.coef_[1]

ax.plot_surface(x0, x1, z)
ax.set_xlabel('Miles')
ax.set_ylabel('Num of Deliveries')
ax.set_zlabel('Time')

plt.show()

2023-10-12

ml►a-algorithm

ml-a01-regression

Regression and Gradient Descent

import numpy as np
import matplotlib.pyplot as plt

data = np.genfromtxt("gradient-descent-data.csv", delimiter=",")
x_data = data[:, 0]
y_data = data[:, 1]
plt.scatter(x_data, y_data)
plt.show()

# loss function
def compute_error(b, k, x_data, y_data):
    """Return half the mean squared error of the line y = k * x + b.

    The squared residuals over all points are summed, divided by
    len(x_data) and then by 2.
    """
    point_count = len(x_data)
    squared_sum = 0
    for idx in range(point_count):
        residual = y_data[idx] - (k * x_data[idx] + b)
        squared_sum += residual ** 2
    return squared_sum / float(point_count) / 2.0

# gradient descent
def gradient_descent_runner(x_data, y_data, b, k, lr, epochs):
    """Fit y = k * x + b with batch gradient descent and return (b, k).

    Every epoch accumulates the averaged gradient of the intercept b and
    the slope k over all points, then moves both one step of size `lr`
    against their gradient.
    """
    # number of points, kept as a float for the 1/m averaging below
    m = float(len(x_data))
    for _ in range(epochs):
        intercept_grad = 0
        slope_grad = 0
        for idx in range(len(x_data)):
            # prediction minus target for the current point
            residual = (k * x_data[idx] + b) - y_data[idx]
            intercept_grad += (1 / m) * residual
            slope_grad += (1 / m) * x_data[idx] * residual
        # both parameters are updated from gradients of the same pass
        b = b - lr * intercept_grad
        k = k - lr * slope_grad
    return b, k

# lr: learning rate
lr = 0.0001
# intercept
b = 0
# slope
k = 0
# number of iterations
epochs = 50

print("starting b={0}, k={1}, error = {2}".format(b, k, compute_error(b, k, x_data, y_data)))

# announce the run before it starts, not after it has already finished
print("Running...")
b, k = gradient_descent_runner(x_data, y_data, b, k, lr, epochs)

print("after {0} iterations b={1}, k={2}, error={3}".format(epochs, b, k, compute_error(b, k, x_data, y_data)))

# blue dots: the data points, red line: the fitted line k * x + b
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, k*x_data +b, 'r')
plt.show()

The output result:


starting b=0, k=0, error = 2782.5539172416056
Running...
after 50 iterations b=0.030569950649287983, k=1.4788903781318357, error=56.32488184238028

PS:
gradient-descent-data.csv

Sklearn

1
2
3

from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

data = np.genfromtxt("gradient-descent-data.csv", delimiter=",")
x_data = data[:, 0]
y_data = data[:, 1]
plt.scatter(x_data,y_data)
plt.show()
print(x_data.shape)

(100,)

x_data = data[:, 0, np.newaxis]
y_data = data[:, 1, np.newaxis]
model = LinearRegression()
model.fit(x_data,y_data)

LinearRegression()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

1
2
3

plt.plot(x_data,y_data,'b.')
plt.plot(x_data, model.predict(x_data), 'r')
plt.show()

2022-12-21

gcp►d-practice

d02-Cloud Functions

Functions-framework

Functions framework helps in setting up the environment for testing of cloud functions locally
Here is an example

installation

1	pip install functions-framework

code example:

create a new project and install functions-framework as above, and then create a new python file main.py

def multiply(request):
    """HTTP function that multiplies the two numbers sent in a JSON body.

    Expects a JSON object with numeric "num_1" and "num_2" fields and
    returns a (body, status) tuple: 200 with the product, or 400 when the
    body is missing, is not a JSON object, or a field is absent or not a
    number.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        return ("Request body must be a JSON object", 400)

    operands = []
    for key in ("num_1", "num_2"):
        value = request_json.get(key)
        # bool is a subclass of int; reject it so true/false are not multiplied.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return (f'"{key}" must be a number', 400)
        operands.append(value)

    result = operands[0] * operands[1]
    return (f"The multiplication result is {result}", 200)

run the function on local:

1	functions-framework --port 8080 --target multiply --signature-type http --source main.py --debug

Test the function
use another terminal and enter the curl command:

curl -X POST \
-H "Content-type:application/json" \
-d  '{"num_1":20, "num_2": 30}' \
-w '\n' \
http://localhost:8080

Response in this new terminal:

2022-10-10

python►c-pandas

Python-c06-update data

people = {
    "first": ["Corey", 'Jane', 'John'], 
    "last": ["Schafer", 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}

1	import pandas as pd

1	df = pd.DataFrame(people)

1	df.columns

Index(['first', 'last', 'email'], dtype='object')

# change all of the column names
df.columns = ['first_name', 'last_name', 'email']

df

	first_name	last_name	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnDoe@email.com

# change all column names to upper case
df.columns = [x.upper() for x in df.columns]

df

	FIRST_NAME	LAST_NAME	EMAIL
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnDoe@email.com

1
2
3

# replace _ by space in all of the column names and vice versa
df.columns = df.columns.str.replace('_', ' ')
df

	FIRST NAME	LAST NAME	EMAIL
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnDoe@email.com

1
2
3

df.columns = df.columns.str.replace(' ', '_')
df.columns = [x.lower() for x in df.columns]
df

	first_name	last_name	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnDoe@email.com

1
2
3

# change only some columns: pass a dictionary for the key as the old name and the value as the new name
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)
df

	first	last	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnDoe@email.com

1	# change the data values

1
2
3

# change all column values for a specific row
df.loc[2] = ['John', 'Smith', 'JohnSmith@email.com']
df

	first	last	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Smith	JohnSmith@email.com

1
2
3

# change only some columns in the row 2
df.loc[2, ['last', 'email']] = ['Smith', 'JohnSmith@email.com']
df

	first	last	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Smith	JohnSmith@email.com

1
2
3

# or we can also use at to change a specific column value
df.at[2, 'last'] = 'Doe'
df

	first	last	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnSmith@email.com

# change value with filter: we have to use loc
filt = (df['email'] == 'JohnSmith@email.com')
df.loc[filt, 'last'] = 'Smith'
df

	first	last	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Smith	JohnSmith@email.com

1
2
3

# change email to lower case
df['email'] = df['email'].str.lower()
df

	first	last	email
0	Corey	Schafer	coreymschafer@gmail.com
1	Jane	Doe	janedoe@email.com
2	John	Smith	johnsmith@email.com

1
2
3

# apply
# return the email length of each row
df['email'].apply(len)

0 23
1 17
2 19
Name: email, dtype: int64

def update_email(email):
    """Return the given email address converted to upper case."""
    upper_cased = email.upper()
    return upper_cased

df['email'] = df['email'].apply(update_email)
df

	first	last	email
0	Corey	Schafer	COREYMSCHAFER@GMAIL.COM
1	Jane	Doe	JANEDOE@EMAIL.COM
2	John	Smith	JOHNSMITH@EMAIL.COM

1
2
3

# use lambda function
df['email'] = df['email'].apply(lambda x: x.lower())
df

	first	last	email
0	Corey	Schafer	coreymschafer@gmail.com
1	Jane	Doe	janedoe@email.com
2	John	Smith	johnsmith@email.com

1
2
3

# run apply on a dataframe
# this gives us the value count of each column
df.apply(len)

first 3
last 3
email 3
dtype: int64

# this gives us the number of columns in each row
df.apply(len, axis='columns')

0 3
1 3
2 3
dtype: int64

# get the min of each column
df.apply(pd.Series.min)

first Corey
last Doe
email coreymschafer@gmail.com
dtype: object

# applymap applies the function to every individual element of the DataFrame
# (pandas 2.1+ deprecates applymap in favour of DataFrame.map)
df.applymap(len)

	first	last	email
0	5	7	23
1	4	3	17
2	4	5	19

1
2
3

# map works on the Series
# replace Corey by Chris and Jane by Mary; values missing from the mapping (John) become NaN
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

0 Chris
1 Mary
2 NaN
Name: first, dtype: object

# replace only swaps the values found in the mapping; unmatched values (John) are kept as-is
df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})

0 Chris
1 Mary
2 John
Name: first, dtype: object

2022-10-07

python►c-pandas

Python-c05-Filtering

people = {
    "first": ["Corey", 'Jane', 'John'], 
    "last": ["Schafer", 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}

1	import pandas as pd

1	df = pd.DataFrame(people)

df

	first	last	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnDoe@email.com

1	filt = (df['last'] == 'Doe')

df[filt]

	first	last	email
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnDoe@email.com

# filter df with filt and return just the columns first and email
df.loc[filt, ['first', 'email']]

	first	email
1	Jane	JaneDoe@email.com
2	John	JohnDoe@email.com

# negate the filter with ~ to get the rows that do NOT match
df.loc[~filt, 'email']

0 CoreyMSchafer@gmail.com
Name: email, dtype: object

# isin filter
lasts = ['Doe', 'Mike', 'Bob']
filt = df['last'].isin(lasts)
df.loc[filt, 'email']

1 JaneDoe@email.com
2 JohnDoe@email.com
Name: email, dtype: object

1	df = pd.read_csv('data/survey_results_public.csv')

df

	Respondent	Professional	ProgramHobby	Country	University	EmploymentStatus	FormalEducation	MajorUndergrad	HomeRemote	CompanySize	...	StackOverflowMakeMoney	Gender	HighestEducationParents	Race	SurveyLong	QuestionsInteresting	QuestionsConfusing	InterestedAnswers	Salary	ExpectedSalary
0	1	Student	Yes, both	United States	No	Not employed, and not looking for work	Secondary school	NaN	NaN	NaN	...	Strongly disagree	Male	High school	White or of European descent	Strongly disagree	Strongly agree	Disagree	Strongly agree	NaN	NaN
1	2	Student	Yes, both	United Kingdom	Yes, full-time	Employed part-time	Some college/university study without earning ...	Computer science or software engineering	More than half, but not all, the time	20 to 99 employees	...	Strongly disagree	Male	A master's degree	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Strongly agree	NaN	37500.0
2	3	Professional developer	Yes, both	United Kingdom	No	Employed full-time	Bachelor's degree	Computer science or software engineering	Less than half the time, but at least one day ...	10,000 or more employees	...	Disagree	Male	A professional degree	White or of European descent	Somewhat agree	Agree	Disagree	Agree	113750.0	NaN
3	4	Professional non-developer who sometimes write...	Yes, both	United States	No	Employed full-time	Doctoral degree	A non-computer-focused engineering discipline	Less than half the time, but at least one day ...	10,000 or more employees	...	Disagree	Male	A doctoral degree	White or of European descent	Agree	Agree	Somewhat agree	Strongly agree	NaN	NaN
4	5	Professional developer	Yes, I program as a hobby	Switzerland	No	Employed full-time	Master's degree	Computer science or software engineering	Never	10 to 19 employees	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19097	19098	Professional developer	Yes, I program as a hobby	Canada	No	Employed full-time	Bachelor's degree	A business discipline	A few days each month	10 to 19 employees	...	Disagree	Male	Some college/university study, no bachelor's d...	White or of European descent	Somewhat agree	Agree	Disagree	Agree	NaN	NaN
19098	19099	Student	Yes, I program as a hobby	India	No	Not employed, and not looking for work	Secondary school	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19099	19100	Professional non-developer who sometimes write...	Yes, I program as a hobby	United Kingdom	No	Independent contractor, freelancer, or self-em...	Bachelor's degree	Computer science or software engineering	Never	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19100	19101	Professional developer	Yes, I program as a hobby	United States	No	Employed full-time	Some college/university study without earning ...	A humanities discipline	Less than half the time, but at least one day ...	100 to 499 employees	...	Disagree	Male	Some college/university study, no bachelor's d...	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Agree	110000.0	NaN
19101	19102	Professional developer	Yes, I program as a hobby	France	No	Employed full-time	Master's degree	Computer science or software engineering	All or almost all the time (I'm full-time remote)	100 to 499 employees	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

19102 rows × 154 columns

# na=False makes rows where MajorUndergrad is missing (NaN) evaluate to False in the mask
filt = df['MajorUndergrad'].str.contains('software', na=False)

1	df.loc[filt]

	Respondent	Professional	ProgramHobby	Country	University	EmploymentStatus	FormalEducation	MajorUndergrad	HomeRemote	CompanySize	...	StackOverflowMakeMoney	Gender	HighestEducationParents	Race	SurveyLong	QuestionsInteresting	QuestionsConfusing	InterestedAnswers	Salary	ExpectedSalary
1	2	Student	Yes, both	United Kingdom	Yes, full-time	Employed part-time	Some college/university study without earning ...	Computer science or software engineering	More than half, but not all, the time	20 to 99 employees	...	Strongly disagree	Male	A master's degree	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Strongly agree	NaN	37500.0
2	3	Professional developer	Yes, both	United Kingdom	No	Employed full-time	Bachelor's degree	Computer science or software engineering	Less than half the time, but at least one day ...	10,000 or more employees	...	Disagree	Male	A professional degree	White or of European descent	Somewhat agree	Agree	Disagree	Agree	113750.0	NaN
4	5	Professional developer	Yes, I program as a hobby	Switzerland	No	Employed full-time	Master's degree	Computer science or software engineering	Never	10 to 19 employees	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
7	8	Professional developer	Yes, both	Poland	No	Employed full-time	Master's degree	Computer science or software engineering	All or almost all the time (I'm full-time remote)	Fewer than 10 employees	...	Somewhat agree	Male	A master's degree	White or of European descent	Agree	Somewhat agree	Disagree	Agree	NaN	NaN
8	9	Professional developer	Yes, I program as a hobby	Colombia	Yes, part-time	Employed full-time	Bachelor's degree	Computer science or software engineering	Less than half the time, but at least one day ...	5,000 to 9,999 employees	...	Strongly disagree	Male	A bachelor's degree	Hispanic or Latino/Latina	Somewhat agree	Strongly agree	Disagree	Strongly agree	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19091	19092	Professional developer	No	Sweden	No	Employed full-time	Master's degree	Computer science or software engineering	Never	100 to 499 employees	...	Strongly disagree	Male	A master's degree	White or of European descent	Disagree	Agree	Somewhat agree	Strongly agree	NaN	NaN
19095	19096	Professional developer	Yes, both	United States	No	Employed full-time	Bachelor's degree	Computer science or software engineering	A few days each month	100 to 499 employees	...	Disagree	Male	A bachelor's degree	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Somewhat agree	NaN	NaN
19096	19097	Professional developer	No	France	No	Employed full-time	Master's degree	Computer science or software engineering	Never	20 to 99 employees	...	Strongly disagree	Male	High school	White or of European descent	Strongly agree	Somewhat agree	Somewhat agree	Somewhat agree	NaN	NaN
19099	19100	Professional non-developer who sometimes write...	Yes, I program as a hobby	United Kingdom	No	Independent contractor, freelancer, or self-em...	Bachelor's degree	Computer science or software engineering	Never	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19101	19102	Professional developer	Yes, I program as a hobby	France	No	Employed full-time	Master's degree	Computer science or software engineering	All or almost all the time (I'm full-time remote)	100 to 499 employees	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

8098 rows × 154 columns

2022-10-07

python►c-pandas

Python-c04-indexes

1	import pandas as pd

df = pd.read_csv('data/survey_results_public.csv', index_col='Respondent')
df_schema = pd.read_csv('data/survey_results_schema.csv', index_col='Column')

# inplace=True modifies df_schema directly instead of returning a sorted copy
df_schema.sort_index(inplace=True)

df_schema

	Question
Column
AnnoyingUI	It annoys me when software has a poor UI
AssessJobCommute	When you're assessing potential jobs to apply ...
AssessJobCompensation	When you're assessing potential jobs to apply ...
AssessJobDept	When you're assessing potential jobs to apply ...
AssessJobDiversity	When you're assessing potential jobs to apply ...
...	...
WorkPayCare	I don't really care what I work on, so long as...
WorkStart	Suppose you could choose your own working hour...
YearsCodedJob	For how many years have you coded as part of y...
YearsCodedJobPast	For how many years did you code as part of you...
YearsProgram	How long has it been since you first learned h...

154 rows × 1 columns

# use the Country column as the index
df.set_index('Country', inplace=True)

# show the index labels
df.index

Index(['United States', 'United Kingdom', 'United Kingdom', 'United States',
       'Switzerland', 'New Zealand', 'United States', 'Poland', 'Colombia',
       'France',
       ...
       'United States', 'India', 'United Kingdom', 'United States', 'France',
       'Canada', 'India', 'United Kingdom', 'United States', 'France'],
      dtype='object', name='Country', length=19102)

# pass an index label to loc to select every row with that label
df.loc['United States']

	University	EmploymentStatus	FormalEducation	MajorUndergrad	HomeRemote	CompanySize	CompanyType	YearsProgram	YearsCodedJob	YearsCodedJobPast	...	StackOverflowMakeMoney	Gender	HighestEducationParents	Race	SurveyLong	QuestionsInteresting	QuestionsConfusing	InterestedAnswers	Salary	ExpectedSalary
Country
United States	No	Not employed, and not looking for work	Secondary school	NaN	NaN	NaN	NaN	2 to 3 years	NaN	NaN	...	Strongly disagree	Male	High school	White or of European descent	Strongly disagree	Strongly agree	Disagree	Strongly agree	NaN	NaN
United States	No	Employed full-time	Doctoral degree	A non-computer-focused engineering discipline	Less than half the time, but at least one day ...	10,000 or more employees	Non-profit/non-governmental organization or pr...	14 to 15 years	9 to 10 years	NaN	...	Disagree	Male	A doctoral degree	White or of European descent	Agree	Agree	Somewhat agree	Strongly agree	NaN	NaN
United States	No	Employed full-time	Master's degree	A non-computer-focused engineering discipline	Less than half the time, but at least one day ...	20 to 99 employees	Government agency or public school/university	9 to 10 years	8 to 9 years	NaN	...	Disagree	Male	A doctoral degree	White or of European descent	Disagree	Agree	Disagree	Agree	NaN	NaN
United States	No	Employed full-time	Bachelor's degree	A social science	All or almost all the time (I'm full-time remote)	100 to 499 employees	Venture-funded startup	12 to 13 years	11 to 12 years	NaN	...	Strongly disagree	Female	Some college/university study, no bachelor's d...	White or of European descent	Strongly disagree	Agree	Strongly disagree	Strongly agree	NaN	NaN
United States	Yes, part-time	Independent contractor, freelancer, or self-em...	Primary/elementary school	NaN	All or almost all the time (I'm full-time remote)	NaN	NaN	11 to 12 years	1 to 2 years	NaN	...	Disagree	Male	A doctoral degree	White or of European descent	Disagree	Somewhat agree	Strongly disagree	Agree	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
United States	No	Employed full-time	Bachelor's degree	A natural science	All or almost all the time (I'm full-time remote)	Fewer than 10 employees	Privately-held limited company, not in startup...	6 to 7 years	3 to 4 years	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
United States	No	Employed part-time	Some college/university study without earning ...	Computer science or software engineering	Less than half the time, but at least one day ...	10 to 19 employees	Sole proprietorship or partnership, not in sta...	2 to 3 years	1 to 2 years	NaN	...	NaN	Male	Some college/university study, no bachelor's d...	White or of European descent	Disagree	Agree	Disagree	Strongly agree	NaN	NaN
United States	No	Employed full-time	Bachelor's degree	Management information systems	Never	10,000 or more employees	Publicly-traded corporation	20 or more years	20 or more years	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
United States	No	Employed full-time	Bachelor's degree	Computer science or software engineering	A few days each month	100 to 499 employees	Venture-funded startup	20 or more years	19 to 20 years	NaN	...	Disagree	Male	A bachelor's degree	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Somewhat agree	NaN	NaN
United States	No	Employed full-time	Some college/university study without earning ...	A humanities discipline	Less than half the time, but at least one day ...	100 to 499 employees	Sole proprietorship or partnership, not in sta...	20 or more years	20 or more years	NaN	...	Disagree	Male	Some college/university study, no bachelor's d...	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Agree	110000.0	NaN

4364 rows × 150 columns

# restore the default integer index; Country becomes a regular column again
df.reset_index(inplace=True)

df

	Country	University	EmploymentStatus	FormalEducation	MajorUndergrad	HomeRemote	CompanySize	CompanyType	YearsProgram	YearsCodedJob	...	StackOverflowMakeMoney	Gender	HighestEducationParents	Race	SurveyLong	QuestionsInteresting	QuestionsConfusing	InterestedAnswers	Salary	ExpectedSalary
0	United States	No	Not employed, and not looking for work	Secondary school	NaN	NaN	NaN	NaN	2 to 3 years	NaN	...	Strongly disagree	Male	High school	White or of European descent	Strongly disagree	Strongly agree	Disagree	Strongly agree	NaN	NaN
1	United Kingdom	Yes, full-time	Employed part-time	Some college/university study without earning ...	Computer science or software engineering	More than half, but not all, the time	20 to 99 employees	Privately-held limited company, not in startup...	9 to 10 years	NaN	...	Strongly disagree	Male	A master's degree	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Strongly agree	NaN	37500.0
2	United Kingdom	No	Employed full-time	Bachelor's degree	Computer science or software engineering	Less than half the time, but at least one day ...	10,000 or more employees	Publicly-traded corporation	20 or more years	20 or more years	...	Disagree	Male	A professional degree	White or of European descent	Somewhat agree	Agree	Disagree	Agree	113750.0	NaN
3	United States	No	Employed full-time	Doctoral degree	A non-computer-focused engineering discipline	Less than half the time, but at least one day ...	10,000 or more employees	Non-profit/non-governmental organization or pr...	14 to 15 years	9 to 10 years	...	Disagree	Male	A doctoral degree	White or of European descent	Agree	Agree	Somewhat agree	Strongly agree	NaN	NaN
4	Switzerland	No	Employed full-time	Master's degree	Computer science or software engineering	Never	10 to 19 employees	Privately-held limited company, not in startup...	20 or more years	10 to 11 years	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19097	Canada	No	Employed full-time	Bachelor's degree	A business discipline	A few days each month	10 to 19 employees	Privately-held limited company, not in startup...	1 to 2 years	1 to 2 years	...	Disagree	Male	Some college/university study, no bachelor's d...	White or of European descent	Somewhat agree	Agree	Disagree	Agree	NaN	NaN
19098	India	No	Not employed, and not looking for work	Secondary school	NaN	NaN	NaN	NaN	1 to 2 years	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19099	United Kingdom	No	Independent contractor, freelancer, or self-em...	Bachelor's degree	Computer science or software engineering	Never	NaN	NaN	14 to 15 years	14 to 15 years	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19100	United States	No	Employed full-time	Some college/university study without earning ...	A humanities discipline	Less than half the time, but at least one day ...	100 to 499 employees	Sole proprietorship or partnership, not in sta...	20 or more years	20 or more years	...	Disagree	Male	Some college/university study, no bachelor's d...	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Agree	110000.0	NaN
19101	France	No	Employed full-time	Master's degree	Computer science or software engineering	All or almost all the time (I'm full-time remote)	100 to 499 employees	Sole proprietorship or partnership, not in sta...	14 to 15 years	9 to 10 years	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

19102 rows × 151 columns

1	df.sort_index(ascending=False)

	Country	University	EmploymentStatus	FormalEducation	MajorUndergrad	HomeRemote	CompanySize	CompanyType	YearsProgram	YearsCodedJob	...	StackOverflowMakeMoney	Gender	HighestEducationParents	Race	SurveyLong	QuestionsInteresting	QuestionsConfusing	InterestedAnswers	Salary	ExpectedSalary
19101	France	No	Employed full-time	Master's degree	Computer science or software engineering	All or almost all the time (I'm full-time remote)	100 to 499 employees	Sole proprietorship or partnership, not in sta...	14 to 15 years	9 to 10 years	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19100	United States	No	Employed full-time	Some college/university study without earning ...	A humanities discipline	Less than half the time, but at least one day ...	100 to 499 employees	Sole proprietorship or partnership, not in sta...	20 or more years	20 or more years	...	Disagree	Male	Some college/university study, no bachelor's d...	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Agree	110000.0	NaN
19099	United Kingdom	No	Independent contractor, freelancer, or self-em...	Bachelor's degree	Computer science or software engineering	Never	NaN	NaN	14 to 15 years	14 to 15 years	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19098	India	No	Not employed, and not looking for work	Secondary school	NaN	NaN	NaN	NaN	1 to 2 years	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19097	Canada	No	Employed full-time	Bachelor's degree	A business discipline	A few days each month	10 to 19 employees	Privately-held limited company, not in startup...	1 to 2 years	1 to 2 years	...	Disagree	Male	Some college/university study, no bachelor's d...	White or of European descent	Somewhat agree	Agree	Disagree	Agree	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4	Switzerland	No	Employed full-time	Master's degree	Computer science or software engineering	Never	10 to 19 employees	Privately-held limited company, not in startup...	20 or more years	10 to 11 years	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	United States	No	Employed full-time	Doctoral degree	A non-computer-focused engineering discipline	Less than half the time, but at least one day ...	10,000 or more employees	Non-profit/non-governmental organization or pr...	14 to 15 years	9 to 10 years	...	Disagree	Male	A doctoral degree	White or of European descent	Agree	Agree	Somewhat agree	Strongly agree	NaN	NaN
2	United Kingdom	No	Employed full-time	Bachelor's degree	Computer science or software engineering	Less than half the time, but at least one day ...	10,000 or more employees	Publicly-traded corporation	20 or more years	20 or more years	...	Disagree	Male	A professional degree	White or of European descent	Somewhat agree	Agree	Disagree	Agree	113750.0	NaN
1	United Kingdom	Yes, full-time	Employed part-time	Some college/university study without earning ...	Computer science or software engineering	More than half, but not all, the time	20 to 99 employees	Privately-held limited company, not in startup...	9 to 10 years	NaN	...	Strongly disagree	Male	A master's degree	White or of European descent	Somewhat agree	Somewhat agree	Disagree	Strongly agree	NaN	37500.0
0	United States	No	Not employed, and not looking for work	Secondary school	NaN	NaN	NaN	NaN	2 to 3 years	NaN	...	Strongly disagree	Male	High school	White or of European descent	Strongly disagree	Strongly agree	Disagree	Strongly agree	NaN	NaN

19102 rows × 151 columns

2022-10-07

python►c-pandas

Python-c03-DataFrame and Series

1	import pandas as pd

person = {
    "first": "Corey", 
    "last": "Schafer", 
    "email": "CoreyMSchafer@gmail.com"
}

people = {
    "first": ["Corey"], 
    "last": ["Schafer"], 
    "email": ["CoreyMSchafer@gmail.com"]
}

people = {
    "first": ["Corey", 'Jane', 'John'], 
    "last": ["Schafer", 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}

1	people['email']

['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

1	df = pd.DataFrame(people)

df

	first	last	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com
2	John	Doe	JohnDoe@email.com

1	df['email']

0 CoreyMSchafer@gmail.com
1 JaneDoe@email.com
2 JohnDoe@email.com
Name: email, dtype: object

# select multiple columns by passing a list of column names
df[['last', 'email']]

	last	email
0	Schafer	CoreyMSchafer@gmail.com
1	Doe	JaneDoe@email.com
2	Doe	JohnDoe@email.com

1
2
3

# iloc: integer location: put the integer index to get the row
# get the first row in this example
df.iloc[0]

first Corey
last Schafer
email CoreyMSchafer@gmail.com
Name: 0, dtype: object

1	df.iloc[[0, 1]]

	first	last	email
0	Corey	Schafer	CoreyMSchafer@gmail.com
1	Jane	Doe	JaneDoe@email.com

# get the email and last columns of the first and second rows
df.loc[[0, 1], ['email', 'last']]

	email	last
0	CoreyMSchafer@gmail.com	Schafer
1	JaneDoe@email.com	Doe

1	df.iloc[[0, 1], ['email', 'last']]

---------------------------------------------------------------------------

IndexError Traceback (most recent call last)

Cell In [15], line 1
----> 1 df.iloc[[0, 1], ['email', 'last']]

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1068, in _LocationIndexer.__getitem__(self, key)
1066 if self._is_scalar_access(key):
1067 return self.obj._get_value(*key, takeable=self._takeable)
-> 1068 return self._getitem_tuple(key)
1069 else:
1070 # we by definition only have the 0th axis
1071 axis = self.axis or 0

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1564, in _iLocIndexer._getitem_tuple(self, tup)
1562 def _getitem_tuple(self, tup: tuple):
-> 1564 tup = self._validate_tuple_indexer(tup)
1565 with suppress(IndexingError):
1566 return self._getitem_lowerdim(tup)

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:874, in _LocationIndexer._validate_tuple_indexer(self, key)
872 for i, k in enumerate(key):
873 try:
--> 874 self._validate_key(k, i)
875 except ValueError as err:
876 raise ValueError(
877 "Location based indexing can only have "
878 f"[{self._valid_types}] types"
879 ) from err

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexing.py:1478, in _iLocIndexer._validate_key(self, key, axis)
1476 # check that the key has a numeric dtype
1477 if not is_numeric_dtype(arr.dtype):
-> 1478 raise IndexError(f".iloc requires numeric indexers, got {arr}")
1480 # check that the key does not exceed the maximum size of the index
1481 if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):

IndexError: .iloc requires numeric indexers, got ['email' 'last']

1	df.iloc[[0, 1], [1,2]]

	last	email
0	Schafer	CoreyMSchafer@gmail.com
1	Doe	JaneDoe@email.com

MA Jian's Blog

Enthusiasm in developing

Python-c07-Filtering-and-Ordering

Python-d01-selenium

ml-a03-polynomial

ml-a02-multiple_linear_regression

Sklearn

ml-a01-regression

Sklearn

d02-Cloud Functions

Functions-framework

installation

Python-c06-update data

Python-c05-Filtering

Python-c04-indexes

Python-c03-DataFrame and Series