Data Manipulation and Visualization

Contents

Data Manipulation and Visualization#

numpy for array operation
pandas for dataframe operation
matplotlib for visualization

[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Array Operations#

Create array
Reshape
Filtering
Operations: sort, max, min, mean, sum
Broadcasting
Concatenation
Splitting
Multiplication
- Element-wise multiplication
- Matrix multiplication

[4]:

# Create array
arr = np.array([1, 2, 3, 4, 5])
arr

[4]:

array([1, 2, 3, 4, 5])

[5]:

arr.shape

[5]:

(5,)

[7]:

# Reshape
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = arr1.reshape(3, 2)
arr1.shape, arr2.shape

[7]:

((2, 3), (3, 2))

[19]:

# Filtering
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr[arr > 2]

[19]:

array([3, 4, 5, 6])

[23]:

# Operations
arr = np.random.randint(0, 10, 10)
print(arr)
arr.sort()  # in-place sorting, only acs order
print(arr)
arr.max(), arr.min(), arr.mean(), arr.sum()

[7 5 4 0 6 9 1 3 5 6]

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[23], line 4
      2 arr = np.random.randint(0, 10, 10)
      3 print(arr)
----> 4 arr.sort()[::-1]
      5 print(arr)
      6 arr.max(), arr.min(), arr.mean(), arr.sum()

TypeError: 'NoneType' object is not subscriptable

[6]:

# Broadcasting
arr = np.array([1, 2, 3, 4, 5])
arr + 1

[6]:

array([2, 3, 4, 5, 6])

[7]:

# Concatenation
arr1 = np.array([[1, 2], [4, 5]])
arr2 = np.array([[7, 8], [10, 11]])
np.concatenate((arr1, arr2), axis=0), np.concatenate((arr1, arr2), axis=1)

[7]:

(array([[ 1,  2],
        [ 4,  5],
        [ 7,  8],
        [10, 11]]),
 array([[ 1,  2,  7,  8],
        [ 4,  5, 10, 11]]))

[8]:

# Splitting
arr = np.array([[1, 2, 3], [4, 5, 6]])
np.split(arr, 2, axis=0)

[8]:

[array([[1, 2, 3]]), array([[4, 5, 6]])]

[39]:

# Element-wise Multiplication
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
arr1 * arr2

[39]:

array([[ 5, 12],
       [21, 32]])

[25]:

# Matrix Multiplication
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
arr1.dot(arr2), arr1 @ arr2, np.matmul(arr1, arr2)

[25]:

(array([[19, 22],
        [43, 50]]),
 array([[19, 22],
        [43, 50]]),
 array([[19, 22],
        [43, 50]]))

DataFrame Operations#

Create dataframe
Elements: index, columns, rows
Indexing
Descriptive statistics
Filtering
Operations: sort, max, min, mean, sum
Concatenation
Groupby
Pivot
Merge & Join

[26]:

# Create dataframe
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'Age': [24, 27, 22, 32, 29, 35, 20, 26, 25, 30],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Philadelphia', 'Phoenix', 'Philadelphia', 'Los Angeles', 'San Diego', 'Los Angeles', 'San Jose'],
    'Score': [88, 92, 95, 70, 85, 90, 89, 78, 93, 80]
}
df = pd.DataFrame(data)
df

[26]:

	Name	Age	City	Score
0	Alice	24	New York	88
1	Bob	27	Los Angeles	92
2	Charlie	22	Chicago	95
3	David	32	Philadelphia	70
4	Eve	29	Phoenix	85
5	Frank	35	Philadelphia	90
6	Grace	20	Los Angeles	89
7	Helen	26	San Diego	78
8	Ivy	25	Los Angeles	93
9	Jack	30	San Jose	80

[27]:

# Elements
df.index, df.columns, df.values

[27]:

(RangeIndex(start=0, stop=10, step=1),
 Index(['Name', 'Age', 'City', 'Score'], dtype='object'),
 array([['Alice', 24, 'New York', 88],
        ['Bob', 27, 'Los Angeles', 92],
        ['Charlie', 22, 'Chicago', 95],
        ['David', 32, 'Philadelphia', 70],
        ['Eve', 29, 'Phoenix', 85],
        ['Frank', 35, 'Philadelphia', 90],
        ['Grace', 20, 'Los Angeles', 89],
        ['Helen', 26, 'San Diego', 78],
        ['Ivy', 25, 'Los Angeles', 93],
        ['Jack', 30, 'San Jose', 80]], dtype=object))

[11]:

# Indexing
df.loc[0, "Name"], df.iloc[0, 0], df["Name"][0]

[11]:

('Alice', 'Alice', 'Alice')

[12]:

# Descriptive statistics
df.describe()

[12]:

	Age	Score
count	10.000000	10.00000
mean	27.000000	86.00000
std	4.594683	7.83156
min	20.000000	70.00000
25%	24.250000	81.25000
50%	26.500000	88.50000
75%	29.750000	91.50000
max	35.000000	95.00000

[13]:

# Filtering
df[df['Age'] > 25]
df.query('Age > 25')

[13]:

	Name	Age	City	Score
1	Bob	27	Los Angeles	92
3	David	32	Philadelphia	70
4	Eve	29	Phoenix	85
5	Frank	35	Philadelphia	90
7	Helen	26	San Diego	78
9	Jack	30	San Jose	80

[14]:

# Operations
df.sort_values(by='Age')
df["Score"].max(), df["Score"].min(), df["Score"].mean(), df["Score"].sum()

[14]:

(95, 70, 86.0, 860)

[15]:

# Concatenation
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data)
pd.concat([df1, df2], axis=0)

[15]:

	Name	Age	City	Score
0	Alice	24	New York	88
1	Bob	27	Los Angeles	92
2	Charlie	22	Chicago	95
3	David	32	Philadelphia	70
4	Eve	29	Phoenix	85
5	Frank	35	Philadelphia	90
6	Grace	20	Los Angeles	89
7	Helen	26	San Diego	78
8	Ivy	25	Los Angeles	93
9	Jack	30	San Jose	80
0	Alice	24	New York	88
1	Bob	27	Los Angeles	92
2	Charlie	22	Chicago	95
3	David	32	Philadelphia	70
4	Eve	29	Phoenix	85
5	Frank	35	Philadelphia	90
6	Grace	20	Los Angeles	89
7	Helen	26	San Diego	78
8	Ivy	25	Los Angeles	93
9	Jack	30	San Jose	80

[37]:

# Groupby
df.groupby('City').agg({
    'Name': 'count',
    'Age': ['mean', 'min', 'max'],
    'Score': ['mean', 'sum', 'max']
}).sort_values(by=('Name', 'count'), ascending=False)

[37]:

	Name	Age			Score
	count	mean	min	max	mean	sum	max
City
Los Angeles	3	24.0	20	27	91.333333	274	93
Philadelphia	2	33.5	32	35	80.000000	160	90
Chicago	1	22.0	22	22	95.000000	95	95
New York	1	24.0	24	24	88.000000	88	88
Phoenix	1	29.0	29	29	85.000000	85	85
San Diego	1	26.0	26	26	78.000000	78	78
San Jose	1	30.0	30	30	80.000000	80	80

[17]:

# Pivot
df.pivot_table(index='City', values='Score', aggfunc=['mean', 'sum', 'max'])

[17]:

	mean	sum	max
	Score	Score	Score
City
Chicago	95.000000	95	95
Los Angeles	91.333333	274	93
New York	88.000000	88	88
Philadelphia	80.000000	160	90
Phoenix	85.000000	85	85
San Diego	78.000000	78	78
San Jose	80.000000	80	80

[18]:

# Merge
df1 = pd.DataFrame({
    'Name': ['Bob', 'Charlie', 'David', 'Eve'],
    'Age': [27, 22, 32, 29]
})

df2 = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Score': [88, 92, 95, 70]
})

[19]:

# inner join
pd.merge(df1, df2, on='Name', how='inner')  # equivalent to df1.join(df2, on='Name', how='inner')

[19]:

	Name	Age	Score
0	Bob	27	92
1	Charlie	22	95
2	David	32	70

[20]:

# outer join
pd.merge(df1, df2, on='Name', how='outer')  # equivalent to df1.join(df2, on='Name', how='outer')

[20]:

	Name	Age	Score
0	Alice	NaN	88.0
1	Bob	27.0	92.0
2	Charlie	22.0	95.0
3	David	32.0	70.0
4	Eve	29.0	NaN

[21]:

# left join
pd.merge(df1, df2, on='Name', how='left')   # equivalent to df1.join(df2, on='Name', how='left')

[21]:

	Name	Age	Score
0	Bob	27	92.0
1	Charlie	22	95.0
2	David	32	70.0
3	Eve	29	NaN

[22]:

# right join
pd.merge(df1, df2, on='Name', how='right')  # equivalent to df1.join(df2, on='Name', how='right')

[22]:

	Name	Age	Score
0	Alice	NaN	88
1	Bob	27.0	92
2	Charlie	22.0	95
3	David	32.0	70

Visualization#

Common Figures#

spaghetti plot
histogram
bar plot
box plot
pie plot
heatmap

[23]:

# Spaghetti Plot
x = np.arange(0, 10, 1)
y = np.sin(x)
plt.plot(x, y)
plt.show()

../_images/lectures_data-man-vis_29_0.png

[24]:

# histogram
data = np.random.randn(1000)
plt.hist(data, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.show()

../_images/lectures_data-man-vis_30_0.png

[25]:

# bar plot
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'Score': [88, 92, 95, 70, 85, 90, 89, 78, 93, 80]
}
df = pd.DataFrame(data)
plt.bar(df['Name'], df['Score'], alpha=0.7)
plt.show()

../_images/lectures_data-man-vis_31_0.png

[26]:

# box plot with multiple columns
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'math_score': [88, 92, 95, 70, 85, 90, 89, 78, 93, 80],
    'history_score': [78, 82, 85, 60, 75, 80, 79, 68, 83, 70]
}

df = pd.DataFrame(data)
plt.boxplot([df['math_score'], df['history_score']])
plt.xticks([1, 2], ['Math', 'History'])
plt.show()

../_images/lectures_data-man-vis_32_0.png

[27]:

# pie plot
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'Income': [88, 92, 95, 70, 85, 90, 89, 78, 93, 80]
}
df = pd.DataFrame(data)
plt.pie(df['Income'], labels=df['Name'], autopct='%1.1f%%')
plt.show()

../_images/lectures_data-man-vis_33_0.png

[28]:

# heatmap
data = np.random.randn(10, 10)
plt.imshow(data, cmap='cool', interpolation='nearest')
plt.colorbar()
plt.show()

../_images/lectures_data-man-vis_34_0.png

Other Components#

title
axis
label
legend
ticks

[29]:

x = np.arange(0, 10, 0.1)
y = np.sin(x)
plt.plot(x, y, 'x')

# title
plt.title('Point Plot of sin(x)')
# axis
plt.axis([0, 10, -1.1, 1.1])
# label
plt.xlabel('x')
plt.ylabel('sin(x)')
# legend
plt.legend(['sin(x)'])
# ticks
plt.xticks(np.arange(0, 11, 1))
plt.show()

../_images/lectures_data-man-vis_36_0.png

Subplots#

[32]:

# subplot
x = np.arange(1, 10, 0.1)
y1 = np.sin(x)
y2 = np.log(x)
y3 = np.power(x, 2)
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
plt.suptitle('Subplot Panel')
axs[0].plot(x, y1)
axs[0].title.set_text('$sin(x)$')
axs[1].plot(x, y2)
axs[1].title.set_text('$log(x)$')
axs[2].plot(x, y3)
axs[2].title.set_text('$x^2$')
plt.show()

../_images/lectures_data-man-vis_38_0.png