In the Creating tables for a stock prices database recipe, we defined a schema for a historical stock prices database. In this recipe, we will populate the tables with data from Yahoo Finance and plot average volumes for different time frames and business sectors.
Stock market researchers have found several strange phenomena that have to do with seasonal effects. Also, there are certain recurring events such as earnings announcements, dividend payments, and options expirations. Again, economic theory tells us that any patterns we observe are either illusions or already known to all market participants. Whether this is true or not is hard to confirm; however, this recipe is great as exercise in data analysis. Also, you can use the database to optimize your portfolio as explained in the Optimizing an equal weights 2 asset portfolio recipe.
The code is in the populate_database.ipynb
file in this book's code bundle:
import database_tables as tables import pandas as pd import os import dautil as dl import ch7util import sqlite3 import matplotlib.pyplot as plt import seaborn as sns from IPython.display import HTML
def populate_date_dim(session): for d in pd.date_range(start='19000101', end='20250101'): adate = tables.DateDim(date=d.date(), day_of_month=d.day, day_of_week=d.dayofweek, month=d.month, quarter=d.quarter, year=d.year) session.add(adate) session.commit()
def populate_asset_dim(session): asset = tables.AssetDim(symbol='AAPL', name='Apple Inc.', category='Common Stock', country='USA', sector='Consumer Goods') session.add(asset) asset = tables.AssetDim(symbol='INTC', name='Intel Corporation', category='Common Stock', country='USA', sector='Technology') session.add(asset) asset = tables.AssetDim(symbol='MSFT', name='Microsoft Corporation', category='Common Stock', country='USA', sector='Technology') session.add(asset) asset = tables.AssetDim(symbol='KO', name='The Coca-Cola Company', category='Common Stock', country='USA', sector='Consumer Goods') session.add(asset) asset = tables.AssetDim(symbol='DIS', name='The Walt Disney Company', category='Common Stock', country='USA', sector='Services') session.add(asset) asset = tables.AssetDim(symbol='MCD', name='McDonald's Corp.', category='Common Stock', country='USA', sector='Services') session.add(asset) asset = tables.AssetDim(symbol='NKE', name='NIKE, Inc.', category='Common Stock', country='USA', sector='Consumer Goods') session.add(asset) asset = tables.AssetDim(symbol='IBM', name='International Business Machines Corporation', category='Common Stock', country='USA', sector='Technology') session.add(asset) session.commit()
def populate_source_dim(session): session.add(tables.SourceDim(name='Yahoo Finance', url='https://finance.yahoo.com')) session.commit()
def populate_prices(session): symbols = dl.db.map_to_id(session, tables.AssetDim.symbol) dates = dl.db.map_to_id(session, tables.DateDim.date) source_id = session.query(tables.SourceDim).first().id ohlc = dl.data.OHLC() conn = sqlite3.connect(dbname) c = conn.cursor() insert = '''INSERT INTO stock_price (id, date_id, asset_id, source_id, open_price, high_price, low_price, close_price, adjusted_close, volume) VALUES({id}, {date_id}, {asset_id}, {source_id}, {open_price}, {high_price}, {low_price}, {close_price}, {adj_close}, {volume})''' logger = dl.log_api.conf_logger(__name__) for symbol in ch7util.STOCKS: df = ohlc.get(symbol) i = 0 for index, row in df.iterrows(): date_id = dates[index.date()] asset_id = symbols[symbol] i += 1 stmt = insert.format(id=i, date_id=date_id, asset_id=asset_id, source_id=source_id, open_price=dl.data.centify(row['Open']), high_price=dl.data.centify(row['High']), low_price=dl.data.centify(row['Low']), close_price=dl.data.centify(row['Close']), adj_close=dl.data.centify(row['Adj Close']), volume=int(row['Volume'])) c.execute(stmt) if i % 1000 == 0: logger.info("Progress %s %s", symbol, i) conn.commit() conn.commit() c.close() conn.close()
def populate(session): if session.query(tables.SourceDim).count() == 0: populate_source_dim(session) populate_asset_dim(session) populate_date_dim(session) populate_prices(session)
def plot_volume(col, ax): df = pd.read_sql(sql.format(col=col), conn) sns.barplot(x=col, y='AVG(P.Volume/1000)', data=df, hue='sector', ax=ax) ax.legend(loc='best') dbname = os.path.join(dl.data.get_data_dir(), 'stock_prices.db') session = dl.db.create_session(dbname, tables.Base) populate(session) sql = ''' SELECT A.sector, D.{col}, AVG(P.Volume/1000) FROM stock_price P INNER JOIN date_dim D ON (P.Date_Id = D.Id) INNER JOIN asset_dim A ON (P.asset_id = a.Id) GROUP BY A.sector, D.{col} '''
conn = sqlite3.connect(dbname) sp = dl.plotting.Subplotter(2, 2, context) plot_volume('day_of_week', sp.ax) sp.ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri']) plot_volume('month', sp.next_ax()) sp.ax.set_xticklabels(dl.ts.short_months()) plot_volume('day_of_month', sp.next_ax()) plot_volume('quarter', sp.next_ax()) HTML(sp.exit())
Refer to the following screenshot for the end result: