Data Manipulation and Visualization#

  1. numpy for array operation

  2. pandas for dataframe operation

  3. matplotlib for visualization

[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Array Operations#

  1. Create array

  2. Reshape

  3. Filtering

  4. Operations: sort, max, min, mean, sum

  5. Broadcasting

  6. Concatenation

  7. Splitting

  8. Multiplication

    • Element-wise multiplication

    • Matrix multiplication

[4]:
# Create array
arr = np.array([1, 2, 3, 4, 5])
arr
[4]:
array([1, 2, 3, 4, 5])
[5]:
arr.shape
[5]:
(5,)
[7]:
# Reshape
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = arr1.reshape(3, 2)
arr1.shape, arr2.shape
[7]:
((2, 3), (3, 2))
[19]:
# Filtering
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr[arr > 2]
[19]:
array([3, 4, 5, 6])
[23]:
# Operations
arr = np.random.randint(0, 10, 10)
print(arr)
arr.sort()  # in-place sorting, only acs order
print(arr)
arr.max(), arr.min(), arr.mean(), arr.sum()
[7 5 4 0 6 9 1 3 5 6]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[23], line 4
      2 arr = np.random.randint(0, 10, 10)
      3 print(arr)
----> 4 arr.sort()[::-1]
      5 print(arr)
      6 arr.max(), arr.min(), arr.mean(), arr.sum()

TypeError: 'NoneType' object is not subscriptable
[6]:
# Broadcasting
arr = np.array([1, 2, 3, 4, 5])
arr + 1
[6]:
array([2, 3, 4, 5, 6])
[7]:
# Concatenation
arr1 = np.array([[1, 2], [4, 5]])
arr2 = np.array([[7, 8], [10, 11]])
np.concatenate((arr1, arr2), axis=0), np.concatenate((arr1, arr2), axis=1)
[7]:
(array([[ 1,  2],
        [ 4,  5],
        [ 7,  8],
        [10, 11]]),
 array([[ 1,  2,  7,  8],
        [ 4,  5, 10, 11]]))
[8]:
# Splitting
arr = np.array([[1, 2, 3], [4, 5, 6]])
np.split(arr, 2, axis=0)
[8]:
[array([[1, 2, 3]]), array([[4, 5, 6]])]
[39]:
# Element-wise Multiplication
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
arr1 * arr2
[39]:
array([[ 5, 12],
       [21, 32]])
[25]:
# Matrix Multiplication
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
arr1.dot(arr2), arr1 @ arr2, np.matmul(arr1, arr2)
[25]:
(array([[19, 22],
        [43, 50]]),
 array([[19, 22],
        [43, 50]]),
 array([[19, 22],
        [43, 50]]))

DataFrame Operations#

  1. Create dataframe

  2. Elements: index, columns, rows

  3. Indexing

  4. Descriptive statistics

  5. Filtering

  6. Operations: sort, max, min, mean, sum

  7. Concatenation

  8. Groupby

  9. Pivot

  10. Merge & Join

[26]:
# Create dataframe
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'Age': [24, 27, 22, 32, 29, 35, 20, 26, 25, 30],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Philadelphia', 'Phoenix', 'Philadelphia', 'Los Angeles', 'San Diego', 'Los Angeles', 'San Jose'],
    'Score': [88, 92, 95, 70, 85, 90, 89, 78, 93, 80]
}
df = pd.DataFrame(data)
df
[26]:
Name Age City Score
0 Alice 24 New York 88
1 Bob 27 Los Angeles 92
2 Charlie 22 Chicago 95
3 David 32 Philadelphia 70
4 Eve 29 Phoenix 85
5 Frank 35 Philadelphia 90
6 Grace 20 Los Angeles 89
7 Helen 26 San Diego 78
8 Ivy 25 Los Angeles 93
9 Jack 30 San Jose 80
[27]:
# Elements
df.index, df.columns, df.values
[27]:
(RangeIndex(start=0, stop=10, step=1),
 Index(['Name', 'Age', 'City', 'Score'], dtype='object'),
 array([['Alice', 24, 'New York', 88],
        ['Bob', 27, 'Los Angeles', 92],
        ['Charlie', 22, 'Chicago', 95],
        ['David', 32, 'Philadelphia', 70],
        ['Eve', 29, 'Phoenix', 85],
        ['Frank', 35, 'Philadelphia', 90],
        ['Grace', 20, 'Los Angeles', 89],
        ['Helen', 26, 'San Diego', 78],
        ['Ivy', 25, 'Los Angeles', 93],
        ['Jack', 30, 'San Jose', 80]], dtype=object))
[11]:
# Indexing
df.loc[0, "Name"], df.iloc[0, 0], df["Name"][0]
[11]:
('Alice', 'Alice', 'Alice')
[12]:
# Descriptive statistics
df.describe()
[12]:
Age Score
count 10.000000 10.00000
mean 27.000000 86.00000
std 4.594683 7.83156
min 20.000000 70.00000
25% 24.250000 81.25000
50% 26.500000 88.50000
75% 29.750000 91.50000
max 35.000000 95.00000
[13]:
# Filtering
df[df['Age'] > 25]
df.query('Age > 25')
[13]:
Name Age City Score
1 Bob 27 Los Angeles 92
3 David 32 Philadelphia 70
4 Eve 29 Phoenix 85
5 Frank 35 Philadelphia 90
7 Helen 26 San Diego 78
9 Jack 30 San Jose 80
[14]:
# Operations
df.sort_values(by='Age')
df["Score"].max(), df["Score"].min(), df["Score"].mean(), df["Score"].sum()
[14]:
(95, 70, 86.0, 860)
[15]:
# Concatenation
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data)
pd.concat([df1, df2], axis=0)
[15]:
Name Age City Score
0 Alice 24 New York 88
1 Bob 27 Los Angeles 92
2 Charlie 22 Chicago 95
3 David 32 Philadelphia 70
4 Eve 29 Phoenix 85
5 Frank 35 Philadelphia 90
6 Grace 20 Los Angeles 89
7 Helen 26 San Diego 78
8 Ivy 25 Los Angeles 93
9 Jack 30 San Jose 80
0 Alice 24 New York 88
1 Bob 27 Los Angeles 92
2 Charlie 22 Chicago 95
3 David 32 Philadelphia 70
4 Eve 29 Phoenix 85
5 Frank 35 Philadelphia 90
6 Grace 20 Los Angeles 89
7 Helen 26 San Diego 78
8 Ivy 25 Los Angeles 93
9 Jack 30 San Jose 80
[37]:
# Groupby
df.groupby('City').agg({
    'Name': 'count',
    'Age': ['mean', 'min', 'max'],
    'Score': ['mean', 'sum', 'max']
}).sort_values(by=('Name', 'count'), ascending=False)
[37]:
Name Age Score
count mean min max mean sum max
City
Los Angeles 3 24.0 20 27 91.333333 274 93
Philadelphia 2 33.5 32 35 80.000000 160 90
Chicago 1 22.0 22 22 95.000000 95 95
New York 1 24.0 24 24 88.000000 88 88
Phoenix 1 29.0 29 29 85.000000 85 85
San Diego 1 26.0 26 26 78.000000 78 78
San Jose 1 30.0 30 30 80.000000 80 80
[17]:
# Pivot
df.pivot_table(index='City', values='Score', aggfunc=['mean', 'sum', 'max'])
[17]:
mean sum max
Score Score Score
City
Chicago 95.000000 95 95
Los Angeles 91.333333 274 93
New York 88.000000 88 88
Philadelphia 80.000000 160 90
Phoenix 85.000000 85 85
San Diego 78.000000 78 78
San Jose 80.000000 80 80
[18]:
# Merge
df1 = pd.DataFrame({
    'Name': ['Bob', 'Charlie', 'David', 'Eve'],
    'Age': [27, 22, 32, 29]
})

df2 = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Score': [88, 92, 95, 70]
})
[19]:
# inner join
pd.merge(df1, df2, on='Name', how='inner')  # equivalent to df1.join(df2, on='Name', how='inner')
[19]:
Name Age Score
0 Bob 27 92
1 Charlie 22 95
2 David 32 70
[20]:
# outer join
pd.merge(df1, df2, on='Name', how='outer')  # equivalent to df1.join(df2, on='Name', how='outer')
[20]:
Name Age Score
0 Alice NaN 88.0
1 Bob 27.0 92.0
2 Charlie 22.0 95.0
3 David 32.0 70.0
4 Eve 29.0 NaN
[21]:
# left join
pd.merge(df1, df2, on='Name', how='left')   # equivalent to df1.join(df2, on='Name', how='left')
[21]:
Name Age Score
0 Bob 27 92.0
1 Charlie 22 95.0
2 David 32 70.0
3 Eve 29 NaN
[22]:
# right join
pd.merge(df1, df2, on='Name', how='right')  # equivalent to df1.join(df2, on='Name', how='right')
[22]:
Name Age Score
0 Alice NaN 88
1 Bob 27.0 92
2 Charlie 22.0 95
3 David 32.0 70

Visualization#

Common Figures#

  1. spaghetti plot

  2. histogram

  3. bar plot

  4. box plot

  5. pie plot

  6. heatmap

[23]:
# Spaghetti Plot
x = np.arange(0, 10, 1)
y = np.sin(x)
plt.plot(x, y)
plt.show()
../_images/lectures_data-man-vis_29_0.png
[24]:
# histogram
data = np.random.randn(1000)
plt.hist(data, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.show()
../_images/lectures_data-man-vis_30_0.png
[25]:
# bar plot
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'Score': [88, 92, 95, 70, 85, 90, 89, 78, 93, 80]
}
df = pd.DataFrame(data)
plt.bar(df['Name'], df['Score'], alpha=0.7)
plt.show()
../_images/lectures_data-man-vis_31_0.png
[26]:
# box plot with multiple columns
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'math_score': [88, 92, 95, 70, 85, 90, 89, 78, 93, 80],
    'history_score': [78, 82, 85, 60, 75, 80, 79, 68, 83, 70]
}

df = pd.DataFrame(data)
plt.boxplot([df['math_score'], df['history_score']])
plt.xticks([1, 2], ['Math', 'History'])
plt.show()
../_images/lectures_data-man-vis_32_0.png
[27]:
# pie plot
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'Income': [88, 92, 95, 70, 85, 90, 89, 78, 93, 80]
}
df = pd.DataFrame(data)
plt.pie(df['Income'], labels=df['Name'], autopct='%1.1f%%')
plt.show()
../_images/lectures_data-man-vis_33_0.png
[28]:
# heatmap
data = np.random.randn(10, 10)
plt.imshow(data, cmap='cool', interpolation='nearest')
plt.colorbar()
plt.show()
../_images/lectures_data-man-vis_34_0.png

Other Components#

  1. title

  2. axis

  3. label

  4. legend

  5. ticks

[29]:
x = np.arange(0, 10, 0.1)
y = np.sin(x)
plt.plot(x, y, 'x')

# title
plt.title('Point Plot of sin(x)')
# axis
plt.axis([0, 10, -1.1, 1.1])
# label
plt.xlabel('x')
plt.ylabel('sin(x)')
# legend
plt.legend(['sin(x)'])
# ticks
plt.xticks(np.arange(0, 11, 1))
plt.show()
../_images/lectures_data-man-vis_36_0.png

Subplots#

[32]:
# subplot
x = np.arange(1, 10, 0.1)
y1 = np.sin(x)
y2 = np.log(x)
y3 = np.power(x, 2)
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
plt.suptitle('Subplot Panel')
axs[0].plot(x, y1)
axs[0].title.set_text('$sin(x)$')
axs[1].plot(x, y2)
axs[1].title.set_text('$log(x)$')
axs[2].plot(x, y3)
axs[2].title.set_text('$x^2$')
plt.show()
../_images/lectures_data-man-vis_38_0.png