Extract month and year from column in Pandas, create new column


import modules

import pandas as pd
import numpy as np
import datetime

create dummy dataframe

raw_data = {'name': ['Willard Morris', 'Al Jennings', 'Omar Mullins', 'Spencer McDaniel'],
'age': [20, 19, 22, 21],
'favorite_color': ['blue', 'red', 'yellow', "green"],
'grade': [88, 92, 95, 70],
'birth_date': ['01-02-1996', '08-05-1997', '04-28-1996', '12-16-1995']}
df = pd.DataFrame(raw_data, index = ['Willard Morris', 'Al Jennings', 'Omar Mullins', 'Spencer McDaniel'])
df
age birth_date favorite_color grade name
Willard Morris 20 01-02-1996 blue 88 Willard Morris
Al Jennings 19 08-05-1997 red 92 Al Jennings
Omar Mullins 22 04-28-1996 yellow 95 Omar Mullins
Spencer McDaniel 21 12-16-1995 green 70 Spencer McDaniel

create a new column with year of date field 'birth_date'

#pandas datetimeindex docs: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DatetimeIndex.html
#efficient way to extract year from string format date
df['year'] = pd.DatetimeIndex(df['birth_date']).year
df.head()
age birth_date favorite_color grade name year
Willard Morris 20 01-02-1996 blue 88 Willard Morris 1996
Al Jennings 19 08-05-1997 red 92 Al Jennings 1997
Omar Mullins 22 04-28-1996 yellow 95 Omar Mullins 1996
Spencer McDaniel 21 12-16-1995 green 70 Spencer McDaniel 1995

create a new column with month of date field 'birth_date'

#pandas datetimeindex docs: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DatetimeIndex.html
df['month'] = pd.DatetimeIndex(df['birth_date']).month
df.head()
age birth_date favorite_color grade name year month
Willard Morris 20 01-02-1996 blue 88 Willard Morris 1996 1
Al Jennings 19 08-05-1997 red 92 Al Jennings 1997 8
Omar Mullins 22 04-28-1996 yellow 95 Omar Mullins 1996 4
Spencer McDaniel 21 12-16-1995 green 70 Spencer McDaniel 1995 12
#if the date format comes in datetime, we can also extract the day/month/year using the to_period function
#where 'D', 'M', 'Y' are inputs
df['month_year'] = pd.to_datetime(df['birth_date']).dt.to_period('M')
df.head()
age birth_date favorite_color grade name year month month_year
Willard Morris 20 01-02-1996 blue 88 Willard Morris 1996 1 1996-01
Al Jennings 19 08-05-1997 red 92 Al Jennings 1997 8 1997-08
Omar Mullins 22 04-28-1996 yellow 95 Omar Mullins 1996 4 1996-04
Spencer McDaniel 21 12-16-1995 green 70 Spencer McDaniel 1995 12 1995-12


Ace your next data science interview

Get better at data science interviews by solving a few questions per week



Find a bug? Submit a suggested change on Github, or message me on Twitter.