# !pip install beautifulsoup4   # shell/notebook command, not valid Python — run it before this script
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
# One sample week of the Yahoo Finance earnings calendar (max 100 rows per page).
# The original line used a curly opening quote, which is a syntax error.
url = "https://finance.yahoo.com/calendar/earnings?from=2020-07-26&to=2020-08-01&day=2020-07-27"
r = requests.get(url)
r.ok
#Output: True
r.content
# Pass an explicit parser so behavior does not depend on which parser bs4 finds installed.
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find_all('table')
len(table)
# output:
# 1
# Yahoo renders the header captions inside <th> cells ('span' no longer matches).
#spans = soup.table.thead.find_all('span')
spans = soup.table.thead.find_all('th')
# Collect the column captions for the DataFrame (loop indentation was lost in the paste).
columns = []
for span in spans:
    print(span.text)
    columns.append(span.text)
# Output:
# Symbol
# Company
# Earnings Call Time
# EPS Estimate
# Reported EPS
# Surprise(%)
rows = soup.table.tbody.find_all('tr')
len(rows)
# Build the frame in one shot from a list of row dicts: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and row-by-row appends are
# quadratic anyway.
records = []
for row in rows:
    elems = row.find_all('td')
    records.append({columns[i]: elem.text for i, elem in enumerate(elems)})
stocks_df = pd.DataFrame(records, columns=columns)
stocks_df
# Yahoo shows '-' where a value was not reported; drop such rows.
filter1 = stocks_df['Surprise(%)'] != '-'
filter2 = stocks_df['EPS Estimate'] != '-'
filter3 = stocks_df['Reported EPS'] != '-'
# .copy() makes the filtered frame independent, so the astype assignments
# below modify it directly instead of a view (avoids SettingWithCopyWarning
# and potential silent no-ops).
stocks_df_noMissing = stocks_df[filter1 & filter2 & filter3].copy()
len(stocks_df_noMissing)
# Output:
# 79
len(stocks_df_noMissing)
stocks_df_noMissing['EPS Estimate'] = stocks_df_noMissing['EPS Estimate'].astype(float)
stocks_df_noMissing['Reported EPS'] = stocks_df_noMissing['Reported EPS'].astype(float)
stocks_df_noMissing['Surprise(%)'] = stocks_df_noMissing['Surprise(%)'].astype(float)
stocks_df_noMissing.info()
# Output:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 79 entries, 0 to 99
# Data columns (total 6 columns):
#  #   Column              Non-Null Count  Dtype
# ---  ------              --------------  -----
#  0   Symbol              79 non-null     object
#  1   Company             79 non-null     object
#  2   Earnings Call Time  79 non-null     object
#  3   EPS Estimate        79 non-null     float64
#  4   Reported EPS        79 non-null     float64
#  5   Surprise(%)         79 non-null     float64
# Need to supply weekly stats as you see on the website
# from_dt = '2020-07-26'
# to_dt = '2020-08-01'
def get_scrapped_week(from_dt, to_dt):
    """Scrape the Yahoo Finance earnings calendar for one date interval.

    Parameters
    ----------
    from_dt, to_dt : str
        First and last day of the interval, 'YYYY-MM-DD', both inclusive.
        (The original hard-coded range(6) and silently dropped to_dt.)

    Returns
    -------
    pd.DataFrame
        One row per reported stock: the calendar columns plus a 'Date' column.
    """
    # FULL URL with PARAMS example:
    # https://finance.yahoo.com/calendar/earnings?from=2020-07-26&to=2020-08-01&day=2020-07-27
    url = "https://finance.yahoo.com/calendar/earnings"
    size = 100        # Yahoo pages the table 100 rows at a time
    columns = None    # header captions; scraped once from the first page
    records = []      # accumulated row dicts, turned into a DataFrame at the end

    # Derive the day count from the interval itself instead of a constant.
    start = datetime.strptime(from_dt, '%Y-%m-%d')
    n_days = (datetime.strptime(to_dt, '%Y-%m-%d') - start).days + 1

    # scrape every date in the submitted interval
    for day_date in (start + timedelta(n) for n in range(n_days)):
        day_dt = datetime.strftime(day_date, '%Y-%m-%d')
        print(day_dt)
        offset = 0
        # inner cycle pages with `offset` when more than `size` stocks reported that date
        while True:
            params = {'from': from_dt, 'to': to_dt, 'day': day_dt,
                      'offset': offset, 'size': size}
            r = requests.get(url, params=params)
            soup = BeautifulSoup(r.text, 'html.parser')
            # Scrape the header once; Yahoo renders the captions in <th>
            # cells (the 'span' lookup used originally matches nothing on
            # the current markup — see the fix near the top of this file).
            if columns is None:
                columns = [th.text for th in soup.table.thead.find_all('th')]
                for col in columns:
                    print(col)
            # scrape body with row values
            rows = soup.table.tbody.find_all('tr')
            for row in rows:
                elems = row.find_all('td')
                rec = {'Date': day_dt}
                for i, elem in enumerate(elems):
                    rec[columns[i]] = elem.text
                records.append(rec)
            if len(rows) != size:
                # A short page means this was the day's last page.
                print(len(rows) + offset)
                break
            offset += size
    # Build the frame once: DataFrame.append was removed in pandas 2.0.
    # 'Date' goes last to match the original column order.
    return pd.DataFrame(records, columns=list(columns) + ['Date'])
# stocks_df.to_csv('stocks.csv', index = False)
# NOTE(review): the lines below were an orphaned, duplicated copy-paste of the
# body of get_scrapped_week() — a bare `return` at module level, undefined
# names (`spans`, `soup`, `day_dt`, `offset`) and smart quotes made them
# unrunnable. Commented out rather than deleted:
# for span in spans:
#     print(span.text)
#     columns.append(span.text)
# stocks_df = pd.DataFrame(columns=columns)
# fst = 0
# # scrape body with row values
# rows = soup.table.tbody.find_all('tr')
# for row in rows:
#     elems = row.find_all('td')
#     dict_to_add = {}
#     dict_to_add['Date'] = day_dt
#     for i, elem in enumerate(elems):
#         dict_to_add[columns[i]] = elem.text
#     stocks_df = stocks_df.append(dict_to_add, ignore_index=True)
#     if len(rows) != 100:
#         print(len(rows) + offset)
#         offset = 0
#         break
#     else:
#         offset = offset + 100
# return stocks_df
# Original literals used curly quotes AND en-dashes ('2020–07–05'), which are
# both a syntax error and an invalid %Y-%m-%d date.
stocks_df = get_scrapped_week('2020-07-05', '2020-07-11')
# Output:
# 2020-07-05
# Symbol
# Company
# Earnings Call Time
# EPS Estimate
# Reported EPS
# Surprise(%)
# 8
# 2020-07-06
# 29
# 2020-07-07
# 23
# 2020-07-08
# 23
# 2020-07-09
# 23
# 2020-07-10
# 4
# Drop rows with '-' placeholders before casting to float.
filter1 = stocks_df['Surprise(%)'] != '-'
filter2 = stocks_df['EPS Estimate'] != '-'
filter3 = stocks_df['Reported EPS'] != '-'
# .copy() -> independent frame, so the astype assignments below do not
# trigger SettingWithCopyWarning (or silently fail) on a view of stocks_df.
stocks_df_noMissing = stocks_df[filter1 & filter2 & filter3].copy()
stocks_df_noMissing['EPS Estimate'] = stocks_df_noMissing['EPS Estimate'].astype(float)
stocks_df_noMissing['Reported EPS'] = stocks_df_noMissing['Reported EPS'].astype(float)
stocks_df_noMissing['Surprise(%)'] = stocks_df_noMissing['Surprise(%)'].astype(float)
# NOTE(review): set_index() is NOT in place — this expression only displays
# the re-indexed frame; stocks_df_noMissing keeps its integer index.
stocks_df_noMissing.set_index('Symbol')
# Output:
#        Company        Earnings Call Time  EPS Estimate  Reported EPS  Surprise(%)  Date
# Symbol
# GOOGL  Alphabet Inc.  Time Not Supplied   8.21          10.13         23.42        2020-07-29
# !pip install yfinance   # shell/notebook command, not valid Python — run it before this script
import yfinance as yf
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
# Sanity check on a single well-known ticker (FB reported on 2020-07-29).
row = stocks_df_noMissing[stocks_df_noMissing['Symbol'] == 'FB']
print(row)
# Output:
# Symbol Company Earnings Call Time EPS Estimate Reported EPS Surprise(%) Date
# 776 FB Facebook, Inc. Time Not Supplied 1.39 1.8 29.59 2020-07-29
date = row['Date'].values[0]
print(date)
# Output:
# 2020-07-29
# The original used curly quotes around the format string and em-dashes as
# minus signs — both syntax errors; fixed below.
date = datetime.strptime(row['Date'].values[0], '%Y-%m-%d')
print(date + timedelta(days=3))
print(date - timedelta(days=1))
ticker = yf.Ticker('FB')
# Window: one calendar day before the report to three days after it.
hist = yf.download('FB', start=date - timedelta(days=1), end=date + timedelta(days=3))
# Output:
# 2020-08-01 00:00:00
# 2020-07-28 00:00:00
# [*********************100%***********************] 1 of 1 completed
# 2-trading-day log return and log volume rise, evaluated at the last bar.
hist['r2'] = np.log(hist['Open'] / hist['Open'].shift(2))
hist['volume_rise'] = np.log(hist['Volume'] / hist['Volume'].shift(2))
hist.r2.values[-1]
# Output:
# 0.10145051589492579
hist.volume_rise.values[-1]
# Output:
# 1.361648662790037
import pandas_datareader.data as pdr
from datetime import date
start = datetime(2020, 7, 1)
end = datetime(2020, 8, 10)
print(f'Period 1 month until today: {start} to {end} ')
spx_index = pdr.get_data_stooq('^SPX', start, end)
# Stooq returns rows newest-first; sort ascending so shift(2) looks two
# trading days BACK. On the unsorted frame shift(2) divided by a LATER
# open and flipped the sign of r2 (the pasted output below shows negative
# r2 during a rising market — that was this bug).
spx_index = spx_index.sort_index()
# S&P500 index was growing almost all July 2020 -> need to adjust stock growth after the reporting date
spx_index['Open'].plot.line()
# 2-trading-day log return of the index.
spx_index['r2'] = np.log(spx_index['Open'] / spx_index['Open'].shift(2))
spx_index['r2'].plot.line()
spx_index.head(30)
# Output:
# Open High Low Close Volume r2 Date
# 2020–08–10 3356.04 3363.29 3335.44 3360.47 2565981272 NaN
# 2020–08–07 3340.05 3352.54 3328.72 3351.28 2279160879 NaN
# 2020–08–06 3323.17 3351.03 3318.14 3349.16 2414075395 -0.009843
# 2020–08–05 3317.37 3330.77 3317.37 3327.77 2452040105 -0.006813
# 2020–08–04 3289.92 3306.84 3286.37 3306.51 2403695283 -0.010056
# 2020–08–03 3288.26 3302.73 3284.53 3294.61 2379546705 -0.008814
# …
# For every stock, find the S&P500 2-day return closest to (and no later
# than) three days after its report date.
array_returns_snp500 = []
for index, row in stocks_df_noMissing.iterrows():
    # Em-dashes were used as minus signs in the original — syntax errors.
    start_dt = datetime.strptime(row['Date'], '%Y-%m-%d') - timedelta(days=1)
    end_dt = datetime.strptime(row['Date'], '%Y-%m-%d') + timedelta(days=3)
    # we don't have gaps more than 4 days -> walk back from end_dt to the
    # closest trading day present in the S&P500 frame:
    cur_dt = end_dt
    while cur_dt >= start_dt:
        # Boolean mask instead of spx_index['YYYY-MM-DD']: partial string
        # row indexing via __getitem__ raises KeyError on missing dates in
        # modern pandas instead of returning an empty frame.
        rez_df = spx_index[spx_index.index == cur_dt.strftime('%Y-%m-%d')]
        if len(rez_df) > 0:
            array_returns_snp500.append(rez_df.r2.values[0])
            break
        cur_dt = cur_dt - timedelta(days=1)
    else:
        # Keep this list aligned with stocks_df_noMissing even when no
        # trading day falls inside the window — the original silently
        # skipped, desynchronizing the positional lookup done later.
        array_returns_snp500.append(np.nan)
len(array_returns_snp500)
# Output:1698
len(stocks_df_noMissing)
# Output:1698
array_tickers = []
array_returns = []
array_volume_rise = []
array_volume_usd = []
array_snp500 = []
# `pos` is the POSITION within stocks_df_noMissing; `index` is the pandas
# label (37, 43, ... after filtering). array_returns_snp500 was filled
# positionally, so indexing it with the label — as the original did —
# fetched the wrong element or raised IndexError for large labels.
for pos, (index, row) in enumerate(stocks_df_noMissing.iterrows()):
    start_dt = datetime.strptime(row['Date'], '%Y-%m-%d') - timedelta(days=1)
    end_dt = datetime.strptime(row['Date'], '%Y-%m-%d') + timedelta(days=3)
    hist = yf.download(row['Symbol'], start=start_dt, end=end_dt)
    # We need to have a full data : volume and price for all dates calculate the returns and volume rise
    # ALSO: if end_dt is non-trading day (Sat,Sun) -> we can't directly calc the stats of returns
    if len(hist) < 4:
        continue
    hist['r2'] = np.log(hist['Open'] / hist['Open'].shift(2))
    hist['volume_rise'] = np.log(hist['Volume'] / hist['Volume'].shift(2))
    hist['volume_usd'] = hist['Volume'] * hist['Open']
    print(row)
    print(index)
    print(' — — — — — — — ')
    array_tickers.append(row['Symbol'])
    array_returns.append(hist.r2.values[-1])
    array_volume_rise.append(hist.volume_rise.values[-1])
    array_volume_usd.append(hist.volume_usd.values[-1])
    # We only append values S&P for the stocks that have all the data
    array_snp500.append(array_returns_snp500[pos])
# Output:
# [*********************100%***********************] 1 of 1 completed
# Symbol                                        AEOJF
# Company            AEON Financial Service Co., Ltd.
# Earnings Call Time                Time Not Supplied
# EPS Estimate                                  14.03
# Reported EPS                                     -5
# Surprise(%)                                 -135.67
# Date                                     2020-07-07
# Name: 37, dtype: object
# 37
# - - - - - - -
# [*********************100%***********************] 1 of 1 completed
# Symbol                                         BBBY
# Company                      Bed Bath & Beyond Inc.
# Earnings Call Time                Time Not Supplied
# EPS Estimate                                  -1.22
# Reported EPS                                  -1.96
# Surprise(%)                                  -60.39
# Date                                     2020-07-07
# Name: 43, dtype: object
# 43
# - - - - - - -
# ...
len(array_tickers)
# Output:
# 1003
# Assemble per-ticker results. (The original first built an empty returns_df
# with named columns only to overwrite it on the very next line — dead
# statement removed.)
returns_df = pd.DataFrame(
    [array_tickers, array_returns, array_volume_rise, array_volume_usd, array_snp500]
).transpose()
returns_df.columns = ['Ticker', 'Returns', 'Volume Rise', 'Volume Trade USD', 'Returns S&P500']
returns_df.set_index('Ticker', inplace=True)
returns_df.dropna(inplace=True)
# transpose() of mixed-type rows yields object dtype -> cast back to float.
returns_df['Returns'] = returns_df['Returns'].astype(float)
returns_df['Volume Rise'] = returns_df['Volume Rise'].astype(float)
returns_df['Volume Trade USD'] = returns_df['Volume Trade USD'].astype(float)
returns_df['Returns S&P500'] = returns_df['Returns S&P500'].astype(float)
# exp(log-return) -> growth factor (e.g. 1.05 = +5%).
returns_df['Returns in %'] = np.exp(returns_df['Returns'])
returns_df['Volume Rise in %'] = np.exp(returns_df['Volume Rise'])
# Returns above S&P500 (an em-dash served as the minus sign originally —
# a syntax error).
returns_df['Adj. Returns'] = returns_df['Returns'] - returns_df['Returns S&P500']
returns_df['Adj. Returns in %'] = np.exp(returns_df['Adj. Returns'])
returns_df = returns_df.replace([np.inf, -np.inf], np.nan)
returns_df.hist(figsize=(20, 10), bins=100)
stocks_and_returns = stocks_df_noMissing.set_index('Symbol').join(returns_df)
stocks_and_returns.head()
stocks_and_returns_no_missing = stocks_and_returns.replace([np.inf, -np.inf], np.nan).dropna()
stocks_and_returns_no_missing.info()
# Output:
# <class 'pandas.core.frame.DataFrame'>
# Index: 997 entries, AA to ZEN
# Data columns (total 14 columns):
#  #   Column              Non-Null Count  Dtype
# ---  ------              --------------  -----
#  0   Company             997 non-null    object
#  1   Earnings Call Time  997 non-null    object
#  2   EPS Estimate        997 non-null    float64
#  3   Reported EPS        997 non-null    float64
#  4   Surprise(%)         997 non-null    float64
#  5   Date                997 non-null    object
#  6   Returns             997 non-null    float64
#  7   Volume Rise         997 non-null    float64
#  8   Volume Trade USD    997 non-null    float64
#  9   Returns S&P500      997 non-null    float64
#  10  Returns in %        997 non-null    float64
#  11  Volume Rise in %    997 non-null    float64
#  12  Adj. Returns        997 non-null    float64
#  13  Adj. Returns in %   997 non-null    float64
# dtypes: float64(11), object(3)
# memory usage: 116.8+ KB
# Sort once by traded USD volume; top-50 is just a prefix of top-200
# (the original sorted the whole frame twice).
by_volume = stocks_and_returns_no_missing.sort_values(by='Volume Trade USD', ascending=False)
top50_volume = by_volume.head(50)
print(top50_volume)
top200_volume = by_volume.head(200)
print(top200_volume)
# EPS surprise vs post-report return for both cohorts.
top50_volume[['Surprise(%)', 'Returns in %']].plot.scatter(x='Surprise(%)', y='Returns in %')
top200_volume[['Surprise(%)', 'Returns in %']].plot.scatter(x='Surprise(%)', y='Returns in %')
import matplotlib.pyplot as plt
# Reported EPS vs surprise, colored by the post-report growth factor.
fig, ax = plt.subplots()
top50_volume[['Surprise(%)', 'Reported EPS', 'Returns in %']].plot.scatter(
    x='Reported EPS', y='Surprise(%)', c='Returns in %', colormap='RdYlGn', ax=ax)
fig, ax = plt.subplots()
top200_volume[['Surprise(%)', 'Reported EPS', 'Returns in %']].plot.scatter(
    x='Reported EPS', y='Surprise(%)', c='Returns in %', colormap='RdYlGn', ax=ax)
# Same picture with market-adjusted (above-S&P500) returns.
fig, ax = plt.subplots()
top50_volume[['Surprise(%)', 'Reported EPS', 'Adj. Returns in %']].plot.scatter(
    x='Reported EPS', y='Surprise(%)', c='Adj. Returns in %', colormap='RdYlGn', ax=ax)
fig, ax = plt.subplots()
top50_volume.plot.scatter(x='Reported EPS', y='EPS Estimate', c='Returns in %', colormap='RdYlGn', ax=ax)
# Distribution of adjusted returns for both cohorts.
top200_volume['Adj. Returns in %'].hist(bins=50, alpha=0.5)
top50_volume['Adj. Returns in %'].hist(bins=50, alpha=0.5)