Populating the stock prices database

In the Creating tables for a stock prices database recipe, we defined a schema for a historical stock prices database. In this recipe, we will populate the tables with data from Yahoo Finance and plot average volumes for different time frames and business sectors.

Stock market researchers have found several strange phenomena that have to do with seasonal effects. Also, there are certain recurring events such as earnings announcements, dividend payments, and options expirations. Again, economic theory tells us that any patterns we observe are either illusions or already known to all market participants. Whether this is true or not is hard to confirm; however, this recipe is great as exercise in data analysis. Also, you can use the database to optimize your portfolio as explained in the Optimizing an equal weights 2 asset portfolio recipe.

How to do it...

The code is in the populate_database.ipynb file in this book's code bundle:

  1. The imports are as follows:
    import database_tables as tables
    import pandas as pd
    import os
    import dautil as dl
    import ch7util
    import sqlite3
    import matplotlib.pyplot as plt
    import seaborn as sns
    from IPython.display import HTML
  2. Define the following function to populate the date dimension table:
    def populate_date_dim(session):
        for d in pd.date_range(start='19000101', end='20250101'):
            adate = tables.DateDim(date=d.date(), day_of_month=d.day,
                                   day_of_week=d.dayofweek, month=d.month,
                                   quarter=d.quarter, year=d.year)
            session.add(adate)
    
        session.commit()
  3. Define the following function to populate the asset dimension table:
    def populate_asset_dim(session):
        asset = tables.AssetDim(symbol='AAPL', name='Apple Inc.',
                                category='Common Stock', country='USA',
                                sector='Consumer Goods')
        session.add(asset)
    
        asset = tables.AssetDim(symbol='INTC', name='Intel Corporation',
                                category='Common Stock', country='USA',
                                sector='Technology')
        session.add(asset)
    
        asset = tables.AssetDim(symbol='MSFT', name='Microsoft Corporation',
                                category='Common Stock', country='USA',
                                sector='Technology')
        session.add(asset)
    
        asset = tables.AssetDim(symbol='KO', name='The Coca-Cola Company',
                                category='Common Stock', country='USA',
                                sector='Consumer Goods')
        session.add(asset)
    
        asset = tables.AssetDim(symbol='DIS', name='The Walt Disney Company',
                                category='Common Stock', country='USA',
                                sector='Services')
        session.add(asset)
    
        asset = tables.AssetDim(symbol='MCD', name='McDonald's Corp.',
                                category='Common Stock', country='USA',
                                sector='Services')
        session.add(asset)
    
        asset = tables.AssetDim(symbol='NKE', name='NIKE, Inc.',
                                category='Common Stock', country='USA',
                                sector='Consumer Goods')
        session.add(asset)
    
        asset = tables.AssetDim(symbol='IBM',
                                name='International Business Machines Corporation',
                                category='Common Stock', country='USA',
                                sector='Technology')
        session.add(asset)
    
        session.commit()
  4. Define the following function to populate the source dimension table:
    def populate_source_dim(session):
        session.add(tables.SourceDim(name='Yahoo Finance',
                                     url='https://finance.yahoo.com'))
        session.commit()
  5. Define the following function to populate the fact table holding stock prices:
    def populate_prices(session):
        symbols = dl.db.map_to_id(session, tables.AssetDim.symbol)
        dates = dl.db.map_to_id(session, tables.DateDim.date)
        source_id = session.query(tables.SourceDim).first().id
        ohlc = dl.data.OHLC()
        conn = sqlite3.connect(dbname)
        c = conn.cursor()
        insert = '''INSERT INTO stock_price (id, date_id,
            asset_id, source_id, open_price, high_price, low_price,
            close_price, adjusted_close, volume)  VALUES({id}, {date_id},
            {asset_id}, {source_id}, {open_price}, {high_price},
            {low_price}, {close_price}, {adj_close}, {volume})'''
        logger = dl.log_api.conf_logger(__name__)
    
        for symbol in ch7util.STOCKS:
            df = ohlc.get(symbol)
            i = 0
    
            for index, row in df.iterrows():
                date_id = dates[index.date()]
                asset_id = symbols[symbol]
                i += 1
                stmt = insert.format(id=i, date_id=date_id,
                                     asset_id=asset_id,
                                     source_id=source_id,
                                     open_price=dl.data.centify(row['Open']),
                                     high_price=dl.data.centify(row['High']),
                                     low_price=dl.data.centify(row['Low']),
                                     close_price=dl.data.centify(row['Close']),
                                     adj_close=dl.data.centify(row['Adj Close']),
                                     volume=int(row['Volume']))
                c.execute(stmt)
    
                if i % 1000 == 0:
                    logger.info("Progress %s %s", symbol, i)
    
                conn.commit()
    
            conn.commit()
    
        c.close()
        conn.close()
  6. Define the following function to populate all the tables:
    def populate(session):
        if session.query(tables.SourceDim).count() == 0:
            populate_source_dim(session)
            populate_asset_dim(session)
            populate_date_dim(session)
            populate_prices(session)
  7. Define the following function to plot the average volumes:
    def plot_volume(col, ax):
        df = pd.read_sql(sql.format(col=col), conn)
        sns.barplot(x=col, y='AVG(P.Volume/1000)', data=df,
                    hue='sector', ax=ax)
    
        ax.legend(loc='best')
    
    dbname = os.path.join(dl.data.get_data_dir(), 'stock_prices.db')
    session = dl.db.create_session(dbname, tables.Base)
    populate(session)
    sql = '''
        SELECT
            A.sector,
            D.{col},
            AVG(P.Volume/1000)
        FROM stock_price P
        INNER JOIN date_dim D  ON (P.Date_Id = D.Id)
        INNER JOIN asset_dim A ON (P.asset_id = a.Id)
        GROUP BY
            A.sector,
            D.{col}
          '''
  8. Plot the average volumes with the following code:
    conn = sqlite3.connect(dbname)
    
    sp = dl.plotting.Subplotter(2, 2, context)
    plot_volume('day_of_week', sp.ax)
    sp.ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri'])
    
    plot_volume('month', sp.next_ax())
    sp.ax.set_xticklabels(dl.ts.short_months())
    
    plot_volume('day_of_month', sp.next_ax())
    plot_volume('quarter', sp.next_ax())
    HTML(sp.exit())

Refer to the following screenshot for the end result:

How to do it...
..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset