import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# Tickers to analyse in addition to whatever the "most active" screener
# returns; de-duplicated against the screener output later (see NEW_TICKERS).
ADDITIONAL_TICKERS = ['CAT', 'SNAP', 'TSLA', 'GE', 'GOOG', 'FB', 'AMD', 'ATVI', 'SAP', 'EBAY']
# Example of a full URL with one stock Symbol
# url = "https://finance.yahoo.com/calendar/earnings?symbol=ATVI"
def get_scraped_yahoo_finance_page(url, url_params):
    """Scrape the single stats table from a Yahoo Finance page.

    Parameters
    ----------
    url : str
        Page to scrape, e.g. "https://finance.yahoo.com/calendar/earnings".
    url_params : dict
        Query-string parameters, e.g. {'symbol': ticker}.

    Returns
    -------
    pandas.DataFrame
        One row per table row, columns taken from the table header;
        None when the page does not contain exactly one table or the
        table has no <thead>.
    """
    r = requests.get(url, params=url_params)
    # Explicit parser: bs4 warns (and may pick different parsers per
    # environment) when none is specified.
    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.find_all('table')
    # DEBUG: If we found just 1 table -> it is good
    print('We\'ve found tables:', len(table))
    if len(table) != 1:
        return None
    # Get all column names from the header
    if len(soup.table.find_all('thead')) == 0:
        return None
    spans = soup.table.thead.find_all('span')
    columns = [span.text for span in spans]
    # Hack: on the "most-active" page one header cell is not a <span> tag
    # and is missed by the loop above -> insert that column manually.
    if url.find('https://finance.yahoo.com/most-active') != -1:
        columns.insert(len(columns) - 2, "Market Cap")
    rows = soup.table.tbody.find_all('tr')
    # Collect row dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row-by-row was O(n^2) anyway.
    records = []
    for row in rows:
        elems = row.find_all('td')
        records.append({columns[i]: elem.text for i, elem in enumerate(elems)})
    stocks_df = pd.DataFrame(records, columns=columns)
    return stocks_df
# Each stock has exactly one row matching this pattern: the upcoming
# earnings date (an estimate is published, nothing is reported yet).
def get_next_earnings_records(stocks_df):
    """Return only the rows describing the next (future) earnings date."""
    has_estimate = stocks_df['EPS Estimate'] != '-'
    no_surprise = stocks_df['Surprise(%)'] == '-'
    no_reported = stocks_df['Reported EPS'] == '-'
    return stocks_df[has_estimate & no_surprise & no_reported]
# Remove all records with not filled stats and cast the stat columns to float
def clean_earnings_history_df(stocks_df):
    """Drop rows where any stat column is '-' and cast the stats to float.

    Returns a new, independent DataFrame; the input is left untouched.
    """
    filled = (
        (stocks_df['EPS Estimate'] != '-')
        & (stocks_df['Surprise(%)'] != '-')
        & (stocks_df['Reported EPS'] != '-')
    )
    # .copy() detaches the result from the argument: the float assignments
    # below would otherwise write into a view of `stocks_df` and trigger
    # pandas' SettingWithCopyWarning (undefined behavior in general).
    result = stocks_df[filled].copy()
    for col in ('EPS Estimate', 'Reported EPS', 'Surprise(%)'):
        result[col] = result[col].astype(float)
    return result
# Sanity check on a single ticker: scrape Ford's earnings-history table
# and keep only fully reported quarters (performs a network call).
ticker = 'F'
f_df = get_scraped_yahoo_finance_page(url = "https://finance.yahoo.com/calendar/earnings",url_params = {'symbol':ticker})
# NOTE(review): get_scraped_yahoo_finance_page may return None (page layout
# change / scraping blocked); this next line would then raise -- confirm
# before running unattended.
f_df_clean = clean_earnings_history_df(f_df)
# Scrape the "most active" screener: top stocks by traded volume.
num_stocks = 200
most_active_stocks = get_scraped_yahoo_finance_page(url = "https://finance.yahoo.com/most-active",url_params = {'count':num_stocks})
# https://stackoverflow.com/questions/7312020/why-wont-re-groups-give-me-anything-for-my-one-correctly-matched-group
import re

# Multipliers for the magnitude suffixes Yahoo uses; '1' is the
# no-suffix fallback.
POWERS = {'T': 10 ** 12, 'B': 10 ** 9, 'M': 10 ** 6, '%': 0.01, '1': 1}

def convert_str_to_num(num_str):
    """Read a string with an optional M/B/T/% suffix (e.g. '1.5B', '3.2%')
    and return the corresponding numeric value.

    Returns None when the string contains no number at all.
    (This docstring was a free-floating string *above* the def in the
    original, i.e. a no-op statement; moved inside where it belongs.)
    """
    # Yahoo prints large plain numbers with thousands separators, e.g.
    # Volume '59,127,000'.  Strip the commas first: otherwise the regex
    # stops at the first comma and returns only the leading digits
    # (the original returned 59.0 for that input).
    match = re.search(r"([0-9\.-]+)(M|B|T|%)?", num_str.replace(',', ''))
    if match is None:
        return None
    quantity = match.group(1)
    if match.group(2) is None:
        magnitude = '1'  # no suffix in the end -> don't multiply by anything
    else:
        magnitude = match.group(2)
    return float(quantity) * POWERS[magnitude]
# Cast the scraped text columns to numbers so they can be compared/plotted.
numeric_columns = [
    "Price (Intraday)", "Change", "% Change", "Volume",
    "Avg Vol (3 month)", "Market Cap", "PE Ratio (TTM)",
]
for column_name in numeric_columns:
    most_active_stocks[column_name] = most_active_stocks[column_name].apply(convert_str_to_num)
# Market caps span many orders of magnitude -> work in log10 space.
most_active_stocks["log_market_cap"] = np.log10(most_active_stocks["Market Cap"])
most_active_stocks.log_market_cap.hist(bins=10)
# Split the universe into 3 equally sized buckets: small/medium/large caps.
most_active_stocks["log_market_cap_binned"] = pd.qcut(most_active_stocks.log_market_cap, 3)
def remove_outliers(df, column_name, quantile_threshold):
    """Return the rows of *df* whose *column_name* value lies strictly
    between the `quantile_threshold` and `1 - quantile_threshold`
    quantiles (i.e. trim both tails).
    """
    lower_bound = df[column_name].quantile(quantile_threshold)
    upper_bound = df[column_name].quantile(1 - quantile_threshold)
    in_range = (df[column_name] > lower_bound) & (df[column_name] < upper_bound)
    return df[in_range]
# Distribution of the daily % change with 2% of each tail trimmed.
# (The curly "smart" quotes in the original lines were SyntaxErrors;
# replaced with plain ASCII quotes throughout.)
tmp = remove_outliers(df=most_active_stocks, column_name="% Change", quantile_threshold=0.02)
tmp["% Change"].hist(bins=100)
tmp = remove_outliers(df=most_active_stocks, column_name="Change", quantile_threshold=0.02)
# The difference vs. previous graph: we draw the abs. daily change here vs. relative change in % in the previous graph
tmp["Change"].hist(bins=100)
# We want to get some idea of the stock was traded today vs. 3-month average volume
most_active_stocks["relative_volume"] = most_active_stocks["Volume"] / most_active_stocks["Avg Vol (3 month)"]
import seaborn as sns
sns.set(rc={'figure.figsize': (10, 6)})
# 2-D density: % change vs. relative volume, split by market-cap bucket.
sns.kdeplot(
    data=most_active_stocks, x="% Change", y="relative_volume", hue="log_market_cap_binned", fill=True,
)
sns.set(rc={'figure.figsize': (10, 6)})
# 1-D density of % change per market-cap bucket; common_norm=False
# normalises each bucket separately.
sns.kdeplot(
    data=most_active_stocks, x="% Change", hue="log_market_cap_binned",
    fill=True, common_norm=False,
    # palette="rocket",
    alpha=.5,
    linewidth=0,
)
# Combine the screener tickers with the manually added ones (de-duplicated).
NEW_TICKERS = [x for x in ADDITIONAL_TICKERS if x not in set(most_active_stocks.Symbol)]
# Series.append was removed in pandas 2.0 -> use pd.concat instead.
TICKERS_LIST = pd.concat([most_active_stocks.Symbol, pd.Series(NEW_TICKERS)])
from random import randint
from time import sleep
# Empty dataframe
all_tickers_info = pd.DataFrame({'A': []})
# Scrape the earnings-history table for every ticker, pausing between
# requests so Yahoo does not throttle/ban us.
# (Curly quotes in the original loop body were SyntaxErrors.)
for i, ticker in enumerate(TICKERS_LIST):
    current_ticker_info = get_scraped_yahoo_finance_page(url="https://finance.yahoo.com/calendar/earnings", url_params={'symbol': ticker})
    print(f'Finished with ticker {ticker}, record no {i}')
    if all_tickers_info.empty:
        all_tickers_info = current_ticker_info
    else:
        all_tickers_info = pd.concat([all_tickers_info, current_ticker_info], ignore_index=True)
    # Random sleep 1-3 sec
    sleep(randint(1, 3))
# Upcoming announcements, latest 30 by date.
next_earnings_dates = get_next_earnings_records(all_tickers_info)
next_earnings_dates.sort_values(by='Earnings Date').tail(30)
# Keep only fully reported quarters, with numeric stat columns.
# (The original called this twice on identical input; the duplicate
# call was redundant and is dropped.)
all_tickers_info_clean = clean_earnings_history_df(all_tickers_info)
# Compare the EPS-estimate vs. reported-EPS distributions.
all_tickers_info_clean[['EPS Estimate','Reported EPS']].plot.box()
# Import the submodule explicitly: a bare `import scipy` is not guaranteed
# to expose scipy.stats on all scipy versions.
import scipy.stats
# Keep only observations within 2 standard deviations on BOTH EPS columns.
z_scores = scipy.stats.zscore(all_tickers_info_clean[["EPS Estimate","Reported EPS"]])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 2).all(axis=1)
new_df = all_tickers_info_clean[["EPS Estimate","Reported EPS"]][filtered_entries]
new_df.plot.box()
from datetime import datetime
from datetime import timedelta
# Parse "Apr 29, 2021, 4 PMEDT"-style strings; [:-3] chops the timezone.
# %I (12-hour clock) is required for %p to take effect: with the original
# %H the AM/PM marker was parsed but IGNORED, so afternoon announcements
# kept a wrong morning hour.
all_tickers_info_clean["Earnings Date 2"] = all_tickers_info_clean["Earnings Date"].apply(lambda x: datetime.strptime(x[:-3], "%b %d, %Y, %I %p"))
# Synthetic primary key "SYMBOL|YYYY-MM-DD" for joining with price history.
all_tickers_info_clean["PK"] = all_tickers_info_clean.Symbol + "|" + all_tickers_info_clean["Earnings Date 2"].apply(lambda x: x.strftime("%Y-%m-%d"))
# `!pip install yfinance` is IPython/Jupyter shell magic and a SyntaxError
# in a plain .py file; install the dependency from the shell instead:
#   pip install yfinance
import yfinance as yf
# Start from an empty dataframe
df_stocks_prices = pd.DataFrame({'A': []})
# Download the full price history per ticker and pre-compute the future
# log-returns for 1-7, 30, 90 and 365 days.
# That is: if we buy a stock at some date (e.g. strong EPS) -> was it a
# profitable decision?
for i, ticker in enumerate(TICKERS_LIST):
    yf_ticker = yf.Ticker(ticker)
    historyPrices = yf_ticker.history(period='max')
    historyPrices['Ticker'] = ticker
    # Sometimes there is a problem with the .index value -> guard with try
    # https://stackoverflow.com/questions/610883/how-to-know-if-an-object-has-an-attribute-in-python
    try:
        historyPrices['Year'] = historyPrices.index.year
        historyPrices['Month'] = historyPrices.index.month
        historyPrices['Weekday'] = historyPrices.index.weekday
        historyPrices['Date'] = historyPrices.index.date
    except AttributeError:
        print(historyPrices.index)
    # !!! Important: Close.shift(1) is the close price 1 day BEFORE the
    # current row; Close.shift(-h) is the close price h days AFTER it.
    # Dividing the second by the first gives the return from holding the
    # stock "h days" having bought it the day before the reporting date.
    # (In the original the word "current" sat alone on its own line here:
    # a comment broken across lines, which was a SyntaxError; the inner
    # loop also reused `i`, shadowing the enumerate counter -> renamed.)
    for horizon in [1, 2, 3, 4, 5, 6, 7, 30, 90, 365]:
        historyPrices['r_future' + str(horizon)] = np.log(historyPrices['Close'].shift(-horizon) / historyPrices['Close'].shift(1))
    historyPrices['years_from_now'] = historyPrices['Year'].max() - historyPrices['Year']
    historyPrices['ln_volume'] = np.log(historyPrices['Volume'])
    if df_stocks_prices.empty:
        df_stocks_prices = historyPrices
    else:
        df_stocks_prices = pd.concat([df_stocks_prices, historyPrices], ignore_index=True)
# Primary key "TICKER|YYYY-MM-DD" matching the earnings table's PK.
df_stocks_prices["PK"] = df_stocks_prices.Ticker + "|" + df_stocks_prices["Date"].apply(lambda x: x.strftime('%Y-%m-%d'))
# Spot-check: General Electric rows for October 2020.
ge_oct_2020 = (
    (df_stocks_prices.Ticker == 'GE')
    & (df_stocks_prices.Year == 2020)
    & (df_stocks_prices.Month == 10)
)
df_stocks_prices[ge_oct_2020].head(2)
# Dedup on the synthetic key so the merge below is strictly one-to-one
# (validate= raises otherwise).
all_tickers_info_clean = all_tickers_info_clean.drop_duplicates(subset=['PK'], keep='first')
df_stocks_prices = df_stocks_prices.drop_duplicates(subset=['PK'], keep='first')
merged_df = pd.merge(all_tickers_info_clean, df_stocks_prices, on="PK", validate="one_to_one")
def draw_plot(symbol):
    """Plot the EPS history and post-announcement returns for one ticker.

    Reads the global merged_df; draws three charts: estimate vs. reported
    EPS, surprise %, and the 1-/7-day returns after each announcement.
    """
    # `filter` shadowed the builtin in the original -> renamed locally.
    mask = (merged_df.Symbol == symbol) & (merged_df.Year >= 2010)
    df = merged_df[mask][["EPS Estimate", "Reported EPS", "Surprise(%)", "r_future1", "r_future7", "Date"]]
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        print(df.head(15))
    # Graph1: EPS estimate vs. Reported EPS
    df[["EPS Estimate", "Reported EPS", "Date"]].plot.line(x="Date", figsize=(20,6), title="EPS Estimate vs. Reported")
    # Graph2: Surprise in %
    df[["Surprise(%)", "Date"]].plot.line(x="Date", figsize=(20,6), title="Surprise % (=Reported EPS/EPS Estimate)")
    # Graph3: 1- and 7-days returns.  (The opening curly quote before
    # "r_future7" in the original was a SyntaxError.)
    df[["r_future1", "r_future7", "Date"]].sort_values(by='Date').plot(x="Date", kind='bar', figsize=(20,6), title="Stock jump")
draw_plot("GE")
draw_plot("MSFT")
import matplotlib.pyplot as plt
print('Count observations: ', merged_df.groupby(by='Year').count()['r_future1'])
# Select the two return columns BEFORE .mean(): since pandas 2.0
# GroupBy.mean() raises on non-numeric columns instead of dropping them.
ax = merged_df.groupby(by='Year')[['r_future1', 'r_future7']].mean().plot.line(figsize=(20, 6))
vals = ax.get_yticks()
# Re-label the y axis as percentages.
ax.set_yticklabels(['{:,.1%}'.format(x) for x in vals])
plt.axhline(y=0, color='r', linestyle='-')
plt.title("1 and 7 days returns of stocks after the quarterly earnings results announcement")
def draw_returns(groupby_factor, filter):
    """Plot mean 1- and 7-day post-announcement returns of merged_df rows,
    grouped by *groupby_factor*.

    Parameters
    ----------
    groupby_factor : str
        Column of the global merged_df to group by (e.g. 'Year').
    filter : boolean Series or scalar True
        Extra row filter, combined with Year >= 2000.
        NOTE: the name shadows the builtin; kept unchanged so existing
        keyword callers keep working.
    """
    filter_year = merged_df.Year >= 2000
    print('Count observations: ', merged_df[filter & filter_year].groupby(by=groupby_factor).count()['r_future1'])
    # Column selection before .mean(): pandas 2.0 GroupBy.mean() raises on
    # non-numeric columns instead of silently dropping them.
    ax = merged_df[filter & filter_year].groupby(by=groupby_factor)[['r_future1', 'r_future7']].mean().plot.line(figsize=(20, 6))
    vals = ax.get_yticks()
    ax.set_yticklabels(['{:,.0%}'.format(x) for x in vals])
    plt.axhline(y=0, color='r', linestyle='-')
    plt.title("1 and 7 days returns of stocks after the quarterly earnings results announcement")
    if groupby_factor == 'Year':
        # MaxNLocator was never imported in the original file, so this
        # branch raised NameError; import it locally (matplotlib is
        # already a file dependency via pyplot).
        from matplotlib.ticker import MaxNLocator
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
# Yearly returns bucketed by the sign/size of the reported EPS: first the
# loss-making thresholds, then the profitable ones (same call order as
# spelling each call out).
for eps_threshold in (0, -1):
    draw_returns('Year', merged_df["Reported EPS"] < eps_threshold)
for eps_threshold in (0, 1, 2):
    draw_returns('Year', merged_df["Reported EPS"] > eps_threshold)
# Log-volume distribution; +/-inf values are masked out for the histogram.
merged_df.ln_volume.replace([np.inf, -np.inf], np.nan).hist(bins=100)
merged_df["ln_volume_binned"] = pd.qcut(merged_df["ln_volume"], 10)
merged_df.ln_volume_binned.value_counts()
# OUTPUT:
# (17.674, 21.613] 1226
# (6.396, 14.344] 1226
# (17.111, 17.674] 1225
# (16.74, 17.111] 1225
# (16.436, 16.74] 1225
# (16.157, 16.436] 1225
# (15.85, 16.157] 1225
# (15.49, 15.85] 1225
# (15.037, 15.49] 1225
# (14.344, 15.037] 1225 Name: ln_volume_binned, dtype: int64
# Returns by traded-volume decile, overall and for 2020 specifically.
draw_returns('ln_volume_binned', merged_df["Year"] > 2000)
draw_returns('ln_volume_binned', merged_df["Year"] == 2020)
draw_returns('ln_volume_binned', (merged_df["Year"] == 2020) & (merged_df["Reported EPS"] < 0))
# Returns by earnings-surprise decile.
merged_df["surprise_%_binned"] = pd.qcut(merged_df["Surprise(%)"], 10)
merged_df["surprise_%_binned"].sort_values().value_counts()
# OUTPUT:
# (-31360.231, -17.356] 1226
# (-17.356, -3.39] 1226
# (-3.39, 0.27] 1229
# (0.27, 1.87] 1221
# (1.87, 3.73] 1226
# (3.73, 6.446] 1223
# (6.446, 10.677] 1225
# (10.677, 17.52] 1226
# (17.52, 36.469] 1224
# (36.469, 6900.0] 1226
# Name: surprise_%_binned, dtype: int64
draw_returns('surprise_%_binned', True)