I wanted to explore a data set from Meetup.com. Meetup is a service for organizing online groups that host in-person and virtual events for people with similar interests. People typically use the site to make friends, share a hobby, or network professionally. I am an introvert, but the pandemic left me longing for social interaction, so I joined a group in Seattle as its organizer. I organized many events this past summer and wanted to gather data about them to see what conclusions I could draw, and what predictions I could make, for future events.
The proposed data set comes directly from Meetup via their console interface (https://secure.meetup.com/meetup_api/console/?path=/:urlname/events). The Meetup API used to be free, but it no longer is: you now need the Pro edition to use the full API. However, they do offer a free console that can make GET requests with a response payload of up to 500 objects. It provides simple RESTful HTTP and streaming interfaces for exploring and interacting with the Meetup platform. This assignment entails cleaning and wrangling the extraneous data, then loading it into a data warehouse for exploration and analysis.
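As a sketch of working with that GET interface, the snippet below builds the request URL and extracts the columns we keep. The endpoint shape and the field names (id, local_date, yes_rsvp_count) follow the public v3 events schema and are assumptions here; verify them against the console output before relying on them.

```python
import json

def events_url(urlname, status="past", page=500):
    """Build the public GET endpoint for a group's events (assumed v3 shape)."""
    return (f"https://api.meetup.com/{urlname}/events"
            f"?status={status}&page={page}")

def parse_event(obj):
    """Keep only the fields we later load into the warehouse."""
    return {
        "event_id": obj["id"],
        "status": obj["status"],
        "event_date": obj["local_date"],
        "rsvp_count": obj["yes_rsvp_count"],
        "link": obj["link"],
    }

# A sample payload object, shaped like one event from the console response.
sample = json.loads(
    '{"id": "109370632", "status": "past", "local_date": "2013-03-22", '
    '"yes_rsvp_count": 5, '
    '"link": "https://www.meetup.com/1-5GenAsians/events/109370632/"}'
)
row = parse_event(sample)
```

In the real pipeline, the JSON array returned by the console would be mapped through parse_event before insertion into SQLite.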
Some of the questions I have in mind include:
- What kind of events draw the most reservations? Board games? Karaoke? Hiking?
- What day of the week is the most popular?
- What time of the day do most members RSVP?
- How many events have we had in total in the past years? Are we collectively becoming more popular as a group?
- Are members more inclined to join events that are relatively more popular? What are some of the most popular events and what do they have in common?
- When members RSVP for an event, the data records the time of the response. Is there a time that is more favorable for announcing an event?
- What is the mean and standard deviation of events that each organizer hosts? Are there any patterns?
- Seattle has many districts and events are scattered everywhere in different townships. Which areas or venues are more popular than others?
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from tabulate import tabulate
import matplotlib.pyplot as plt
import sqlite3
con = sqlite3.connect('meetup.db')
cursor = con.cursor()
Let's peek at the schema, limiting the output to the first 30 rows.
Event can be viewed with URL https://www.meetup.com/{group_name}/events/{event_id}/
For example, https://www.meetup.com/1-5GenAsians/events/281289732/
data = cursor.execute("select * from event limit 30;")
print(tabulate(data.fetchall(), headers=["Event ID", "Status",'Event Date', 'RSVP Count', "Link"], tablefmt='psql'))
+------------+----------+--------------+--------------+-------------------------------------------------------+
| Event ID   | Status   | Event Date   |   RSVP Count | Link                                                  |
|------------+----------+--------------+--------------+-------------------------------------------------------|
|  109370632 | past     | 2013-03-22   |            5 | https://www.meetup.com/1-5GenAsians/events/109370632/ |
|  109868492 | past     | 2013-03-23   |            6 | https://www.meetup.com/1-5GenAsians/events/109868492/ |
|  111080172 | past     | 2013-03-30   |           11 | https://www.meetup.com/1-5GenAsians/events/111080172/ |
|  111088522 | past     | 2013-03-30   |            3 | https://www.meetup.com/1-5GenAsians/events/111088522/ |
|  110749042 | past     | 2013-03-31   |            8 | https://www.meetup.com/1-5GenAsians/events/110749042/ |
|  111871212 | past     | 2013-04-05   |           10 | https://www.meetup.com/1-5GenAsians/events/111871212/ |
|  112504332 | past     | 2013-04-06   |            8 | https://www.meetup.com/1-5GenAsians/events/112504332/ |
|  113192492 | past     | 2013-04-12   |           10 | https://www.meetup.com/1-5GenAsians/events/113192492/ |
|  113176612 | past     | 2013-04-13   |            4 | https://www.meetup.com/1-5GenAsians/events/113176612/ |
|  114838502 | past     | 2013-04-19   |            3 | https://www.meetup.com/1-5GenAsians/events/114838502/ |
|  116882312 | past     | 2013-05-03   |            8 | https://www.meetup.com/1-5GenAsians/events/116882312/ |
|  113174462 | past     | 2013-05-04   |            6 | https://www.meetup.com/1-5GenAsians/events/113174462/ |
|  118200732 | past     | 2013-05-11   |           10 | https://www.meetup.com/1-5GenAsians/events/118200732/ |
|  119185682 | past     | 2013-05-18   |            9 | https://www.meetup.com/1-5GenAsians/events/119185682/ |
|  120390832 | past     | 2013-05-24   |            8 | https://www.meetup.com/1-5GenAsians/events/120390832/ |
|  121853942 | past     | 2013-06-02   |            9 | https://www.meetup.com/1-5GenAsians/events/121853942/ |
|  123022632 | past     | 2013-06-07   |            8 | https://www.meetup.com/1-5GenAsians/events/123022632/ |
|  123844992 | past     | 2013-06-15   |           25 | https://www.meetup.com/1-5GenAsians/events/123844992/ |
|  126498932 | past     | 2013-06-28   |           11 | https://www.meetup.com/1-5GenAsians/events/126498932/ |
|  126318262 | past     | 2013-06-29   |            1 | https://www.meetup.com/1-5GenAsians/events/126318262/ |
|  127652212 | past     | 2013-07-03   |            4 | https://www.meetup.com/1-5GenAsians/events/127652212/ |
|  123571462 | past     | 2013-07-06   |            3 | https://www.meetup.com/1-5GenAsians/events/123571462/ |
|  129354962 | past     | 2013-07-13   |            3 | https://www.meetup.com/1-5GenAsians/events/129354962/ |
|  129003072 | past     | 2013-07-13   |            7 | https://www.meetup.com/1-5GenAsians/events/129003072/ |
|  130608792 | past     | 2013-07-27   |           20 | https://www.meetup.com/1-5GenAsians/events/130608792/ |
|  133462162 | past     | 2013-08-10   |           12 | https://www.meetup.com/1-5GenAsians/events/133462162/ |
|  134488172 | past     | 2013-08-17   |            9 | https://www.meetup.com/1-5GenAsians/events/134488172/ |
|  135768512 | past     | 2013-08-24   |           21 | https://www.meetup.com/1-5GenAsians/events/135768512/ |
|  136619502 | past     | 2013-08-31   |           11 | https://www.meetup.com/1-5GenAsians/events/136619502/ |
|  135478542 | past     | 2013-09-07   |            9 | https://www.meetup.com/1-5GenAsians/events/135478542/ |
+------------+----------+--------------+--------------+-------------------------------------------------------+
Let's look at some of the most popular events, defined here as the events with the most RSVPs.
data = cursor.execute("select event_date, rsvp_count, link from event order by rsvp_count desc limit 30;")
print(tabulate(data.fetchall(), headers=['Event Date', 'RSVP Count', 'Link'], tablefmt='psql'))
+--------------+--------------+-------------------------------------------------------+
| Event Date   |   RSVP Count | Link                                                  |
|--------------+--------------+-------------------------------------------------------|
| 2015-03-01   |          109 | https://www.meetup.com/1-5GenAsians/events/219920453/ |
| 2014-11-02   |          105 | https://www.meetup.com/1-5GenAsians/events/212049012/ |
| 2016-10-09   |          102 | https://www.meetup.com/1-5GenAsians/events/233570064/ |
| 2016-04-30   |           85 | https://www.meetup.com/1-5GenAsians/events/230305968/ |
| 2019-12-21   |           83 | https://www.meetup.com/1-5GenAsians/events/266398172/ |
| 2018-03-24   |           82 | https://www.meetup.com/1-5GenAsians/events/248285057/ |
| 2018-06-10   |           80 | https://www.meetup.com/1-5GenAsians/events/250424473/ |
| 2021-08-29   |           69 | https://www.meetup.com/1-5GenAsians/events/279628714/ |
| 2014-08-17   |           57 | https://www.meetup.com/1-5GenAsians/events/194901992/ |
| 2019-01-26   |           54 | https://www.meetup.com/1-5GenAsians/events/257948178/ |
| 2015-03-14   |           47 | https://www.meetup.com/1-5GenAsians/events/220103990/ |
| 2021-10-17   |           46 | https://www.meetup.com/1-5GenAsians/events/281053389/ |
| 2014-06-21   |           44 | https://www.meetup.com/1-5GenAsians/events/182543572/ |
| 2017-10-14   |           43 | https://www.meetup.com/1-5GenAsians/events/243442670/ |
| 2014-03-29   |           40 | https://www.meetup.com/1-5GenAsians/events/164683122/ |
| 2017-06-10   |           40 | https://www.meetup.com/1-5GenAsians/events/240287833/ |
| 2014-07-13   |           33 | https://www.meetup.com/1-5GenAsians/events/190799922/ |
| 2018-02-18   |           32 | https://www.meetup.com/1-5GenAsians/events/247445878/ |
| 2019-02-02   |           32 | https://www.meetup.com/1-5GenAsians/events/257999006/ |
| 2021-10-10   |           32 | https://www.meetup.com/1-5GenAsians/events/280631869/ |
| 2015-09-05   |           30 | https://www.meetup.com/1-5GenAsians/events/224725880/ |
| 2015-11-21   |           30 | https://www.meetup.com/1-5GenAsians/events/226542423/ |
| 2018-10-20   |           30 | https://www.meetup.com/1-5GenAsians/events/255446031/ |
| 2019-08-03   |           30 | https://www.meetup.com/1-5GenAsians/events/262715304/ |
| 2019-11-23   |           30 | https://www.meetup.com/1-5GenAsians/events/266412251/ |
| 2021-09-25   |           28 | https://www.meetup.com/1-5GenAsians/events/280609901/ |
| 2015-06-06   |           27 | https://www.meetup.com/1-5GenAsians/events/222782816/ |
| 2021-10-03   |           27 | https://www.meetup.com/1-5GenAsians/events/280505220/ |
| 2018-06-24   |           26 | https://www.meetup.com/1-5GenAsians/events/251467094/ |
| 2013-06-15   |           25 | https://www.meetup.com/1-5GenAsians/events/123844992/ |
+--------------+--------------+-------------------------------------------------------+
data = cursor.execute("select count(*) from event;")
print(f"Total number of events is {data.fetchone()[0]}")
Total number of events is 814
Let's see the number of events we have for each month.
num_event_per_month = list(cursor.execute("""select strftime('%Y-%m', event_date), count(*) from event
group by strftime('%Y-%m', event_date);"""))
print(tabulate(num_event_per_month, headers=['Month', 'Number of Events'], tablefmt='psql'))
num_event_per_month = num_event_per_month[-24:]
x = range(len(num_event_per_month))
width = 1/1.5
figure = plt.figure(figsize=(20, 6))
axes = figure.add_subplot(1, 1, 1)
axes.bar(x, [r[1] for r in num_event_per_month], width, align="center")
axes.set_xticks(x)
axes.set_xticklabels([r[0] for r in num_event_per_month])
axes.set_title("Number of Events Per Month")
axes.set_xlabel("Month")
axes.set_ylabel("# of Events")
axes.xaxis.grid(False)
plt.setp(axes.xaxis.get_majorticklabels(), rotation=70)
plt.show()
plt.close();
Next, I'm interested in seeing how many events we have per year and how many we hold on each weekday.
num_event_per_year = list(cursor.execute("""select strftime('%Y', event_date), count(*) from event
group by strftime('%Y', event_date);"""))
print(tabulate(num_event_per_year, headers=['Year', 'Number of Events'], tablefmt='psql'))
figure = plt.figure(figsize=(15, 6))
axes = figure.add_subplot(1, 1, 1)
axes.plot([r[0] for r in num_event_per_year], [r[1] for r in num_event_per_year])
axes.plot([r[0] for r in num_event_per_year], [r[1] for r in num_event_per_year], 'ro')
plt.axis([-1, 10, -5, 180])
plt.show()
+--------+--------------------+
|   Year |   Number of Events |
|--------+--------------------|
|   2013 |                 55 |
|   2014 |                 60 |
|   2015 |                 83 |
|   2016 |                 98 |
|   2017 |                 39 |
|   2018 |                165 |
|   2019 |                139 |
|   2020 |                 42 |
|   2021 |                132 |
|   2022 |                  1 |
+--------+--------------------+
num_event_per_weekday = list(cursor.execute("""select strftime('%w', event_date), count(*) from event
group by strftime('%w', event_date);"""))
print(tabulate(num_event_per_weekday, headers=['Day of the Week', 'Number of Events'], tablefmt='psql'))
+-------------------+--------------------+
|   Day of the Week |   Number of Events |
|-------------------+--------------------|
|                 0 |                278 |
|                 1 |                 19 |
|                 2 |                 23 |
|                 3 |                 43 |
|                 4 |                 37 |
|                 5 |                102 |
|                 6 |                312 |
+-------------------+--------------------+
No surprise there: weekends are the most popular days to host events.
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from tabulate import tabulate
import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import random as py_random
import scipy.stats as stats
sns.set(style="whitegrid")
Extract, Transform, Load (ETL)¶
At this point, we should have:
- meetup.sql will create the data schema.
- meetup.py will fill the database tables with the correct data from the Meetup JSON.
- meetup.db will have the data in it.
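As a reference, here is a minimal sketch of the schema that meetup.sql could define. The column names and types are inferred from what events.info() reports later in this notebook; the actual file may differ.

```python
import sqlite3

# Sketch of the event table, inferred from the columns seen in events.info():
# event_id, status, event_date, event_time, rsvp_count, waitlist_count,
# duration, venue_lat, venue_lon, venue_city, link.
SCHEMA = """
CREATE TABLE IF NOT EXISTS event (
    event_id       TEXT PRIMARY KEY,
    status         TEXT,
    event_date     TEXT,      -- ISO date, e.g. '2013-03-22'
    event_time     TEXT,      -- 'HH:MM'
    rsvp_count     INTEGER,
    waitlist_count INTEGER,
    duration       REAL,      -- milliseconds in the raw feed
    venue_lat      REAL,
    venue_lon      REAL,
    venue_city     TEXT,
    link           TEXT
);
"""

con = sqlite3.connect(":memory:")  # the real pipeline would use 'meetup.db'
con.executescript(SCHEMA)
# Column names as SQLite reports them back.
cols = [row[1] for row in con.execute("PRAGMA table_info(event);")]
```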
conn = sqlite3.connect('meetup.db')
cursor = conn.cursor()
Before we start with the exploratory data analysis, let us examine the dataset to get a feeling of the sort of data we are handling.
As a summary from the "Ask" part of the assignment, here are some questions to bear in mind while doing the analysis.
- What kind of events draw the most reservations? Board games? Karaoke? Hiking?
- What day of the week is the most popular?
- What time of the day do most members RSVP?
- How many events have we had in total in the past years? Are we collectively becoming more popular as a group?
- Are members more inclined to join events that are relatively more popular? What are some of the most popular events and what do they have in common?
- When members RSVP for an event, the data records the time of the response. Is there a time that is more favorable for announcing an event?
We can start by peeking at the schema, limiting the output to the first 30 rows.
Event can be viewed with URL https://www.meetup.com/{group_name}/events/{event_id}/
For example, if the event ID is 281289732 for 1-5GenAsians group, https://www.meetup.com/1-5GenAsians/events/281289732/
data = cursor.execute("select event_id, event_date, rsvp_count, venue_lat, venue_lon, venue_city from event limit 30;")
print(tabulate(data.fetchall(), headers=["Event ID", "Event Date", 'RSVP Count', "Lat", "Lon", "City"], tablefmt='psql'))
+------------+--------------+--------------+---------+----------+---------------+
| Event ID | Event Date | RSVP Count | Lat | Lon | City |
|------------+--------------+--------------+---------+----------+---------------|
| 109370632 | 2013-03-22 | 5 | 47.6816 | -122.126 | Redmond |
| 109868492 | 2013-03-23 | 6 | 47.5287 | -121.825 | Snoqualmie |
| 111080172 | 2013-03-30 | 11 | 47.612 | -122.199 | Bellevue |
| 111088522 | 2013-03-30 | 3 | 48.0043 | -122.214 | Everett 98201 |
| 110749042 | 2013-03-31 | 8 | 47.6034 | -122.201 | Bellevue |
| 111871212 | 2013-04-05 | 10 | 47.6098 | -122.204 | Bellevue |
| 112504332 | 2013-04-06 | 8 | 47.667 | -122.1 | Redmond |
| 113192492 | 2013-04-12 | 10 | 47.62 | -122.202 | Bellevue |
| 113176612 | 2013-04-13 | 4 | 47.6573 | -122.318 | Seattle |
| 114838502 | 2013-04-19 | 3 | | | |
| 116882312 | 2013-05-03 | 8 | 47.5772 | -122.169 | Bellevue |
| 113174462 | 2013-05-04 | 6 | 47.8593 | -121.971 | Monroe |
| 118200732 | 2013-05-11 | 10 | 47.612 | -122.199 | Bellevue |
| 119185682 | 2013-05-18 | 9 | 47.5984 | -122.325 | Seattle |
| 120390832 | 2013-05-24 | 8 | 47.7127 | -122.187 | Kirkland |
| 121853942 | 2013-06-02 | 9 | 47.5836 | -122.131 | Bellevue |
| 123022632 | 2013-06-07 | 8 | 47.613 | -122.2 | Bellevue |
| 123844992 | 2013-06-15 | 25 | 47.6906 | -122.402 | Seattle |
| 126498932 | 2013-06-28 | 11 | 47.6233 | -122.161 | Bellevue |
| 126318262 | 2013-06-29 | 1 | | | |
| 127652212 | 2013-07-03 | 4 | 47.7553 | -122.163 | Woodinville |
| 123571462 | 2013-07-06 | 3 | 47.6303 | -122.193 | Bellevue |
| 129354962 | 2013-07-13 | 3 | 47.599 | -122.325 | Seattle |
| 129003072 | 2013-07-13 | 7 | 47.6747 | -122.208 | Kirkland |
| 130608792 | 2013-07-27 | 20 | 47.5839 | -122.129 | Bellevue |
| 133462162 | 2013-08-10 | 12 | 47.7051 | -122.213 | Kirkland |
| 134488172 | 2013-08-17 | 9 | 47.6167 | -122.201 | Bellevue |
| 135768512 | 2013-08-24 | 21 | 47.5984 | -122.323 | Seattle |
| 136619502 | 2013-08-31 | 11 | 47.5836 | -122.131 | Bellevue |
| 135478542 | 2013-09-07 | 9 | 47.6659 | -122.114 | Redmond |
+------------+--------------+--------------+---------+----------+---------------+
Let's look at some of the most popular events, defined here as the events with the most RSVPs.
data = cursor.execute("select event_date, rsvp_count, event_id from event order by rsvp_count desc limit 15;")
print(tabulate(data.fetchall(), headers=['Event Date', 'RSVP Count', 'Event ID'], tablefmt='psql'))
+--------------+--------------+------------+
| Event Date | RSVP Count | Event ID |
|--------------+--------------+------------|
| 2015-03-01 | 109 | 219920453 |
| 2014-11-02 | 105 | 212049012 |
| 2016-10-09 | 102 | 233570064 |
| 2016-04-30 | 85 | 230305968 |
| 2019-12-21 | 83 | 266398172 |
| 2018-03-24 | 82 | 248285057 |
| 2018-06-10 | 80 | 250424473 |
| 2021-08-29 | 69 | 279628714 |
| 2014-08-17 | 57 | 194901992 |
| 2019-01-26 | 54 | 257948178 |
| 2015-03-14 | 47 | 220103990 |
| 2021-10-17 | 46 | 281053389 |
| 2014-06-21 | 44 | 182543572 |
| 2017-10-14 | 43 | 243442670 |
| 2014-03-29 | 40 | 164683122 |
+--------------+--------------+------------+
data = cursor.execute("select count(*) from event;")
print(f"Total number of events is {data.fetchone()[0]}")
Total number of events is 814
Let's see the number of events we have for each month since this group was founded in 2013.
num_event_per_month = list(cursor.execute("""select strftime('%Y-%m', event_date), count(*) from event
group by strftime('%Y-%m', event_date);"""))
print(tabulate(num_event_per_month, headers=['Month', 'Number of Events'], tablefmt='psql'))
Looks like there was a big spike after July 2021. This was when coronavirus quarantine regulations were relaxed in Seattle and people could enjoy more festive events and activities. My guess is that people were tired of being cooped up at home for over a year and a half, so they were eager to go out and socialize.
num_event_per_month = num_event_per_month[-24:]
x = range(len(num_event_per_month))
width = 1/1.5
figure = plt.figure(figsize=(20, 6))
axes = figure.add_subplot(1, 1, 1)
axes.bar(x, [r[1] for r in num_event_per_month], width, align="center")
axes.set_xticks(x)
axes.set_xticklabels([r[0] for r in num_event_per_month])
axes.set_title("Number of Events Per Month")
axes.set_xlabel("Month")
axes.set_ylabel("# of Events")
axes.xaxis.grid(False)
plt.setp(axes.xaxis.get_majorticklabels(), rotation=70)
plt.show()
plt.close();
Next, I'm interested in seeing how many events we have per year and how many we hold on each weekday.
num_event_per_year = list(cursor.execute("""select strftime('%Y', event_date), count(*) from event
group by strftime('%Y', event_date);"""))
print(tabulate(num_event_per_year, headers=['Year', 'Number of Events'], tablefmt='psql'))
figure = plt.figure(figsize=(15, 6))
axes = figure.add_subplot(1, 1, 1)
axes.plot([r[0] for r in num_event_per_year], [r[1] for r in num_event_per_year])
axes.plot([r[0] for r in num_event_per_year], [r[1] for r in num_event_per_year], 'ro')
plt.axis([-1, 10, -5, 180])
plt.show()
+--------+--------------------+
| Year | Number of Events |
|--------+--------------------|
| 2013 | 55 |
| 2014 | 60 |
| 2015 | 83 |
| 2016 | 98 |
| 2017 | 39 |
| 2018 | 165 |
| 2019 | 139 |
| 2020 | 42 |
| 2021 | 132 |
| 2022 | 1 |
+--------+--------------------+
2020 saw a sharp dip, almost certainly due to the pandemic. Inspection of the data preliminarily suggests that most events that year were online, with face-to-face events returning starting in 2021.
num_event_per_weekday = list(cursor.execute("""select strftime('%w', event_date), count(*) from event
group by strftime('%w', event_date);"""))
print(tabulate(num_event_per_weekday, headers=['Day of the Week', 'Number of Events'], tablefmt='psql'))
+-------------------+--------------------+
| Day of the Week | Number of Events |
|-------------------+--------------------|
| 0 | 278 |
| 1 | 19 |
| 2 | 23 |
| 3 | 43 |
| 4 | 37 |
| 5 | 102 |
| 6 | 312 |
+-------------------+--------------------+
No surprise there: weekends are the most popular days to host events.
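For readability: SQLite's strftime('%w') numbers weekdays from 0 = Sunday through 6 = Saturday, so the two big bars above are Sunday (278 events) and Saturday (312 events). A small mapping sketch using the counts from the table above:

```python
# strftime('%w') in SQLite: 0 = Sunday, ..., 6 = Saturday.
WEEKDAYS = ["Sunday", "Monday", "Tuesday", "Wednesday",
            "Thursday", "Friday", "Saturday"]

# Counts copied from the query output above.
counts = {"0": 278, "1": 19, "2": 23, "3": 43, "4": 37, "5": 102, "6": 312}
named = {WEEKDAYS[int(k)]: v for k, v in counts.items()}
```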
events = pd.read_sql('SELECT * FROM event', conn)
events.head()
| event_id | status | event_date | event_time | rsvp_count | waitlist_count | duration | venue_lat | venue_lon | venue_city | link | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 109370632 | past | 2013-03-22 | 19:00 | 5 | 0 | 0.0 | 47.681561 | -122.125504 | Redmond | https://www.meetup.com/1-5GenAsians/events/109... |
| 1 | 109868492 | past | 2013-03-23 | 12:30 | 6 | 0 | 0.0 | 47.528713 | -121.825394 | Snoqualmie | https://www.meetup.com/1-5GenAsians/events/109... |
| 2 | 111080172 | past | 2013-03-30 | 10:00 | 11 | 0 | 0.0 | 47.612015 | -122.198830 | Bellevue | https://www.meetup.com/1-5GenAsians/events/111... |
| 3 | 111088522 | past | 2013-03-30 | 12:00 | 3 | 0 | 0.0 | 48.004272 | -122.214149 | Everett 98201 | https://www.meetup.com/1-5GenAsians/events/111... |
| 4 | 110749042 | past | 2013-03-31 | 17:00 | 8 | 0 | 0.0 | 47.603401 | -122.201317 | Bellevue | https://www.meetup.com/1-5GenAsians/events/110... |
events.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 event_id 814 non-null object
1 status 814 non-null object
2 event_date 814 non-null object
3 event_time 814 non-null object
4 rsvp_count 814 non-null int64
5 waitlist_count 814 non-null int64
6 duration 814 non-null float64
7 venue_lat 762 non-null float64
8 venue_lon 762 non-null float64
9 venue_city 762 non-null object
10 link 814 non-null object
dtypes: float64(3), int64(2), object(6)
memory usage: 70.1+ KB
We need to make a few modifications. Event ID should really be an int64, but a string is fine too; there won't be much to analyze on this column, so we can leave it alone. Status tells us whether the event was in the past or is upcoming; we don't need this information. Venue city and event time should be categorical. Duration is expressed in milliseconds, so we will convert it to hours. Finally, the links are useful for redirecting to the Meetup page but not for analysis, so let's remove them as well.
events['event_id'] = events['event_id'].astype('str')
events['venue_city'] = events['venue_city'].astype('category')
events['event_time'] = events['event_time'].astype('category')
events = events.drop(columns='status')
events = events.drop(columns='link')
events['duration'] = events['duration'] / 3600000
events.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 event_id 814 non-null object
1 event_date 814 non-null object
2 event_time 814 non-null category
3 rsvp_count 814 non-null int64
4 waitlist_count 814 non-null int64
5 duration 814 non-null float64
6 venue_lat 762 non-null float64
7 venue_lon 762 non-null float64
8 venue_city 762 non-null category
dtypes: category(2), float64(3), int64(2), object(2)
memory usage: 50.3+ KB
Exploratory Data Analysis (EDA)¶
Single Variable Exploration¶
Let's begin our single variable exploration first and then analyze the pairwise relationships!
Event Time¶
print(pd.DataFrame(events.event_time.value_counts().sort_index()))
ax = events.event_time.value_counts().sort_values().tail(10).plot.bar(figsize=(20, 10), rot=0)
ax.set_ylabel("Count")
ax.set_xlabel("Event Time")
ax.grid(axis='x')
event_time
00:00 3
03:00 1
05:00 1
06:30 3
07:00 4
... ...
20:45 1
21:00 5
21:15 1
22:00 1
23:59 1
[74 rows x 1 columns]
events.event_time.describe()
count 814
unique 74
top 18:00
freq 71
Name: event_time, dtype: object
6 PM seems to be the most common time to host events. Event time may be a good predictor variable for the regression later on.
RSVP¶
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(1, 1, 1)
axes.hist(events.rsvp_count)
axes.set_title("Distribution of RSVP")
axes.set_xlabel("RSVP")
axes.set_ylabel("Counts")
axes.grid(axis='x')
plt.show()
plt.close()
events.rsvp_count.describe()
count 814.000000
mean 10.968059
std 10.296775
min 1.000000
25% 6.000000
50% 9.000000
75% 12.000000
max 109.000000
Name: rsvp_count, dtype: float64
Looks like most reservations fall between 1 and 30, with a few big events drawing over 100 members.
Waitlist Count¶
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(1, 1, 1)
axes.hist(events.waitlist_count)
axes.set_title("Distribution of Waitlist")
axes.set_xlabel("Waitlist")
axes.set_ylabel("Counts")
axes.grid(axis='x')
plt.show()
plt.close()
events.waitlist_count.describe()
count 814.000000
mean 0.108108
std 0.853678
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 13.000000
Name: waitlist_count, dtype: float64
Almost every event is fully accommodated, as there is hardly ever a waitlist. This is not a useful variable for our linear model later.
Venue City¶
print(pd.DataFrame(events.venue_city.value_counts().sort_index()))
# Sort venue cities by count and plot the 10 most frequented
ax = events.venue_city.value_counts().sort_values().tail(10).plot.bar(figsize=(20, 10), rot=0)
ax.set_ylabel("Count")
ax.set_xlabel("City")
ax.grid(axis='x')
venue_city
Auburn 1
Beijing 1
Bellevue 228
Bellevue 6
Berlin 1
Bothell 3
Burien 1
Edmonds 5
Everett 3
Everett 98201 1
Federal Way 6
Gold Bar 3
Index 1
Issaquah 28
Kenmore 1
Kent 1
Kirkland 47
Kirkland 1
Lake Forest Park 1
Lynnwood 6
Marysville 1
Mercer Island 4
Monroe 2
Mount Vernon 1
Mountlake Ter 1
Mukilteo 3
North Bend 1
North Bend 1
Picnic Point-North Lynnwood 1
Redmond 52
Redmond, 1
Renton 1
Sammamish 1
Seattle 314
Seattle 4
Shoreline 1
Skykomish 1
Snohomish 3
Snohomish, 98296 1
Snoqualmie 1
Snoqualmie Pass 2
Tacoma 2
Tukwila 7
Tukwila 1
Woodinville 7
renton 1
seattle 2
Bellevue and Seattle are the happening cities for hosting events! It will be interesting to apply a random forest technique to this variable.
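Note that the counts above contain near-duplicate city names ("Bellevue" with trailing whitespace, "seattle", "Redmond,", "Everett 98201"). A cleaning sketch that would merge them before grouping; clean_city is a helper introduced here for illustration, not part of the existing pipeline:

```python
import re
import pandas as pd

def clean_city(name):
    """Normalize a raw venue_city value: drop zip codes and stray commas,
    trim whitespace, and standardize capitalization."""
    if pd.isna(name):
        return name
    name = re.sub(r"[\d,]+", "", str(name)).strip()
    return name.title()

# A few of the messy values seen in the value_counts output above.
raw = pd.Series(["Bellevue ", "bellevue", "Everett 98201",
                 "Redmond,", "seattle", "Seattle "])
cleaned = raw.map(clean_city)
```

Applying this before events.groupby("venue_city") would fold the duplicate Bellevue and Seattle rows into single categories.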
Duration¶
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(1, 1, 1)
axes.hist(events.duration)
axes.set_title("Distribution of Duration")
axes.set_xlabel("Duration (hours)")
axes.set_ylabel("Counts")
axes.grid(axis='x')
plt.show()
plt.close()
events.duration.describe()
count 814.000000
mean 2.474918
std 2.740083
min 0.000000
25% 1.000000
50% 2.000000
75% 3.000000
max 38.500000
Name: duration, dtype: float64
Pairwise Relationships¶
Because my questions revolve around the popularity of the events, the variable of interest to compare with is going to be the count of the reservations.
RSVP versus Cities¶
Let's revisit some of the most common venues of the events.
print(pd.DataFrame(events.venue_city.value_counts().sort_values().tail(10)))
venue_city
Lynnwood 6
Bellevue 6
Federal Way 6
Woodinville 7
Tukwila 7
Issaquah 28
Kirkland 47
Redmond 52
Bellevue 228
Seattle 314
grouped_by_city = events.groupby("venue_city")
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(1, 1, 1)
labels = ["Seattle", "Issaquah", "Kirkland", "Redmond", "Bellevue"]
grouped_data = [grouped_by_city["rsvp_count"].get_group(k) for k in labels]
patch = axes.boxplot(grouped_data, labels=labels)
axes.set_xlabel("City")
axes.set_ylabel("RSVP")
axes.set_title("RSVP versus City")
plt.show()
plt.close()
Wow, Bellevue shows a higher RSVP range than the other four top cities.
RSVP versus Event Duration¶
sns.jointplot(data=events, x = "duration", y = "rsvp_count", kind="hex", height=10)
<seaborn.axisgrid.JointGrid at 0x23391854520>
There is no correlation between the duration of the event and the number of reservations. This suggests that members don't necessarily click on the Attend button based on how long the events are!
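The "no correlation" claim can be quantified with Pearson's r. A minimal sketch on synthetic data; the hypothetical demo frame stands in for the real events frame, where the call would simply be events["duration"].corr(events["rsvp_count"]):

```python
import pandas as pd

# Synthetic stand-in for the real data: durations in hours, RSVP counts.
demo = pd.DataFrame({
    "duration":   [1.0, 2.0, 3.0, 1.5, 2.5, 4.0],
    "rsvp_count": [9,   30,  5,   12,  8,   11],
})

# Pearson correlation is the default for Series.corr.
r = demo["duration"].corr(demo["rsvp_count"])
```

A value of r near 0 on the real data would back up the hexbin plot's visual impression.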
RSVP versus Event Time¶
figure = plt.figure(figsize=(18, 6))
axes = figure.add_subplot(1, 1, 1)
axes.scatter(events.event_time, events.rsvp_count, marker="o")
axes.set_ylabel("RSVP")
axes.set_xlabel("Time")
axes.set_title("Scatter Plot of Time vs. RSVP")
plt.xticks(rotation = 90)
plt.show()
plt.close()
We might need to use more sophisticated modeling techniques.
%matplotlib inline
This will make all the matplotlib images appear in the notebook.
import numpy as np
import random as py_random
import numpy.random as np_random
import time
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import sqlite3
import sklearn.linear_model as linear
import random
import patsy
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")
Let's connect to our data warehouse and read the information into a pandas DataFrame.
conn = sqlite3.connect('../2 - get/meetup.db')
cursor = conn.cursor()
events = pd.read_sql('SELECT * FROM event', conn)
events['event_id'] = events['event_id'].astype('str')
events['venue_city'] = events['venue_city'].astype('category')
events['event_time'] = events['event_time'].astype('category')
events = events.drop(columns='status')
events = events.drop(columns='link')
events['duration'] = events['duration'] / 3600000
events.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 event_id 814 non-null object
1 event_date 814 non-null object
2 event_time 814 non-null category
3 rsvp_count 814 non-null int64
4 waitlist_count 814 non-null int64
5 duration 814 non-null float64
6 venue_lat 762 non-null float64
7 venue_lon 762 non-null float64
8 venue_city 762 non-null category
dtypes: category(2), float64(3), int64(2), object(2)
memory usage: 50.3+ KB
The instructions say not to use linear regression or Naive Bayes and to stick to the source material from Lab 6. The following work is based on the Lab 6 solution provided in class.
RSVP is an abbreviation of the French phrase "Répondez s'il vous plaît", which translates to "Respond, please." I want to predict the RSVP count of the next event, with a penalty for errors. Let's use mean squared error as our loss function and the mean as our prediction.
rsvp_predict = events["rsvp_count"].mean()
rsvp_predict
10.968058968058967
I am going to assume the usual 95% error bounds. At the lower end, we have:
rsvp_std = events["rsvp_count"].std()
rsvp_error = rsvp_std * 1.96
print(rsvp_error)
20.18167931988092
rsvp_predict, rsvp_predict - rsvp_error, rsvp_predict + rsvp_error
(10.968058968058967, -9.213620351821953, 31.14973828793989)
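Among constant predictors, the mean is exactly the one that minimizes mean squared error, which justifies using it as the baseline. A quick sanity check on synthetic Poisson counts (a stand-in for the real rsvp_count, which has a similar mean):

```python
import numpy as np

# Synthetic counts with a mean near the observed ~11 RSVPs.
rng = np.random.default_rng(0)
counts = rng.poisson(lam=11, size=1000)

def mse(y, pred):
    """Mean squared error of a constant prediction."""
    return float(np.mean((y - pred) ** 2))

# MSE(c) = variance + (mean - c)^2, so c = mean is always the minimizer.
mse_mean = mse(counts, counts.mean())
mse_median = mse(counts, np.median(counts))
```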
from scipy.stats import norm
figure = plt.figure(figsize=(10,6))
axes = figure.add_subplot(1, 1, 1)
n, bins, patches = axes.hist(events['rsvp_count'], color="DimGray", density=True, bins=11, alpha=0.75)
axes.set_xlabel("Count of RSVP")
axes.set_ylabel("Density")
axes.set_title("Density Histogram of Event RSVP with Normal Plot")
xs = [(b2 + b1)/2 for b1, b2 in zip(bins, bins[1:])]
rsvp_mean = events["rsvp_count"].mean()
rsvp_std = events["rsvp_count"].std()
ys = [norm.pdf(k, loc=rsvp_mean, scale=rsvp_std) for k in xs]
axes.plot(xs, ys, color="darkred")
plt.show()
plt.close()
figure = plt.figure(figsize=(20, 8))
rsvp_mn = events['rsvp_count'].min()
rsvp_mx = events['rsvp_count'].max()
axes = figure.add_subplot(1, 2, 1)
values, base = np.histogram(events['rsvp_count'], bins=11, density=True)
cumulative = np.cumsum(values)
axes.plot(base[:-1], cumulative, color="steelblue")
axes.set_xlim((rsvp_mn, rsvp_mx))
sampled_data = [rsvp_mean + r * rsvp_std for r in np.random.standard_normal(10000)]
values2, base = np.histogram(sampled_data, bins=base, density=True)
cumulative2 = np.cumsum(values2)
axes.plot(base[:-1], cumulative2, color="firebrick")
axes.set_xlim((rsvp_mn, rsvp_mx))
axes.set_xlabel("Empirical v. Theoretical: Normal Distribution")
axes = figure.add_subplot(1, 2, 2)
differences = cumulative2 - cumulative
axes.plot(base[:-1], differences, color='firebrick')
axes.set_xlim((rsvp_mn, rsvp_mx))
axes.hlines(0, rsvp_mn, rsvp_mx, linestyles="dotted")
axes.set_xlabel("Empirical v. Theoretical: Normal Distribution, Difference")
plt.show()
plt.close()
Note that the empirical and theoretical curves for the normal distribution are very close. The difference chart on the right shows how far apart the two distributions are at each RSVP count. Ideally we would see a flat horizontal line at y = 0; here the model fits well except in the region around 20 RSVPs, just above the mean.
events.rsvp_count.describe()
count 814.000000
mean 10.968059
std 10.296775
min 1.000000
25% 6.000000
50% 9.000000
75% 12.000000
max 109.000000
Name: rsvp_count, dtype: float64
This is a very large range, going from a single attendee to 109.
def freedman_diaconis(data):
    quartiles = stats.mstats.mquantiles(data, [0.25, 0.5, 0.75])
    iqr = quartiles[2] - quartiles[0]
    n = len(data)
    h = 2.0 * (iqr / n ** (1.0 / 3.0))
    return int(h)
h = freedman_diaconis(events.rsvp_count)
print("Freedman-Diaconis: ", h)
mn = int(events.rsvp_count.min())
mx = int(events.rsvp_count.max())
bins = [i for i in range(mn, mx + h, h)]
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(1, 1, 1)
axes.hist(events.rsvp_count, bins=bins, color="darkslategray")
axes.set_title("Event RSVP distribution - Freedman-Diaconis")
axes.set_xlabel("RSVP")
plt.show()
plt.close()
Freedman-Diaconis: 1
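One wrinkle with this helper: truncating h with `int()` can return 0 for a large, tightly packed sample, which would make the `range()` step fail. A guarded variant (my own tweak, not part of the assignment code) floors the width at 1:

```python
import numpy as np

def fd_bin_width(data):
    """Freedman-Diaconis bin width, floored at 1 so range() never gets a zero step."""
    data = np.asarray(data, dtype=float)
    q1, q3 = np.percentile(data, [25, 75])
    h = 2.0 * (q3 - q1) / len(data) ** (1.0 / 3.0)
    return max(1, int(h))

print(fd_bin_width(np.arange(100)))   # wide spread: width well above 1
print(fd_bin_width(np.ones(1000)))    # zero IQR: floored to 1 instead of 0
```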
Let's generate the bootstrap of the posterior distribution of $\hat{\theta}$. We can write a simple function to do our bootstrap sampling for us. It takes the data, a metric function, and the number of bootstrap samples as arguments. We then use the function by supplying the Meetup data and our metric function np.mean, and indicating that we want 1000 bootstrap samples. This returns the data we can use as our posterior distribution of the mean.
def bootstrap_sample(data, f, n=100):
    result = []
    m = len(data)
    for _ in range(n):
        sample = np.random.choice(data, m, replace=True)
        r = f(sample)
        result.append(r)
    return np.array(result)
posterior = bootstrap_sample(events.rsvp_count, np.mean, 1000)
figure = plt.figure(figsize=(10, 6))  # first element is width, second is height.
axes = figure.add_subplot(1, 1, 1)
axes.hist(posterior, density=True)
axes.set_ylabel("Density")
axes.set_xlabel(r"$\hat{\theta}$")
axes.set_title(r"Posterior Distribution of $\hat{\theta}$")
plt.show()
plt.close()
Furthermore, we can find the 90% Credible Interval (Bayesian Confidence Interval) for $\hat{\theta}$:
stats.mstats.mquantiles(posterior, [0.05, 0.95])
array([10.37374693, 11.58130221])
There is a 90% probability that the value of $\theta$ is between 10.4 and 11.6 based on the data.
Let's review our pairwise EDA with the target variable, RSVP:
- venue city - definite positive progression but banding suggests an interaction term.
- duration - no relationship at all.
- event time - overall positive
Let's start with a linear model by looking at the correlation coefficients between each variable and the target. First we need the following dummy variables, since venue_city and event_time are categorical.
events = pd.concat([events, pd.get_dummies(events["venue_city"], prefix="city")], axis=1)
events = pd.concat([events, pd.get_dummies(events["event_time"], prefix="time")], axis=1)
events.describe()
| rsvp_count | waitlist_count | duration | venue_lat | venue_lon | city_Auburn | city_Beijing | city_Bellevue | city_Bellevue | city_Berlin | ... | time_19:15 | time_19:30 | time_19:45 | time_20:00 | time_20:30 | time_20:45 | time_21:00 | time_21:15 | time_22:00 | time_23:59 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 814.000000 | 814.000000 | 814.000000 | 762.000000 | 762.000000 | 814.000000 | 814.000000 | 814.000000 | 814.000000 | 814.000000 | ... | 814.000000 | 814.000000 | 814.000000 | 814.000000 | 814.000000 | 814.000000 | 814.000000 | 814.000000 | 814.000000 | 814.000000 |
| mean | 10.968059 | 0.108108 | 2.474918 | 47.431800 | -121.433413 | 0.001229 | 0.001229 | 0.280098 | 0.007371 | 0.001229 | ... | 0.003686 | 0.029484 | 0.001229 | 0.013514 | 0.003686 | 0.001229 | 0.006143 | 0.001229 | 0.001229 | 0.001229 |
| std | 10.296775 | 0.853678 | 2.740083 | 2.998231 | 11.537341 | 0.035050 | 0.035050 | 0.449323 | 0.085590 | 0.035050 | ... | 0.060634 | 0.169263 | 0.035050 | 0.115530 | 0.060634 | 0.035050 | 0.078181 | 0.035050 | 0.035050 | 0.035050 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | -122.519081 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 6.000000 | 0.000000 | 1.000000 | 47.598366 | -122.325502 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 9.000000 | 0.000000 | 2.000000 | 47.618492 | -122.206394 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 12.000000 | 0.000000 | 3.000000 | 47.659306 | -122.152388 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 109.000000 | 13.000000 | 38.500000 | 48.415180 | 116.407394 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 126 columns
def correlations(data, y, xs):
    rs = []
    rhos = []
    for x in xs:
        r = stats.pearsonr(data[y], data[x])[0]
        rs.append(r)
        rho = stats.spearmanr(data[y], data[x])[0]
        rhos.append(rho)
    return pd.DataFrame({"feature": xs, "r": rs, "rho": rhos})
In the Explore step of the assignment we selected the top four most frequented cities as well as the most common times.
correlations(events, "rsvp_count", ["city_Seattle","city_Issaquah","city_Redmond","city_Bellevue",
"time_16:00","time_18:00","time_10:00","time_19:00", "duration"])
| feature | r | rho | |
|---|---|---|---|
| 0 | city_Seattle | -0.029668 | 0.005457 |
| 1 | city_Issaquah | -0.062954 | -0.084100 |
| 2 | city_Redmond | 0.025708 | 0.087630 |
| 3 | city_Bellevue | 0.155868 | 0.128478 |
| 4 | time_16:00 | 0.079179 | 0.050022 |
| 5 | time_18:00 | 0.182046 | 0.087944 |
| 6 | time_10:00 | -0.007607 | 0.019013 |
| 7 | time_19:00 | -0.050407 | -0.028687 |
| 8 | duration | 0.030478 | 0.106092 |
At a glance, there doesn't seem to be any strong relationship, linear or otherwise. Let's attempt a linear regression model anyway.
import models
model = "rsvp_count ~ duration + city_Seattle"
result1 = models.bootstrap_linear_regression(model, data=events)
models.describe_bootstrap_lr(result1)
Model: rsvp_count ~ duration + city_Seattle
| | Coefficients | Mean | 95% BCI Lo | 95% BCI Hi |
|---|---|---|---|---|
| (intercept) | $\beta_{0}$ | 10.93 | 10.06 | 12.18 |
| duration | $\beta_{1}$ | 0.11 | -0.03 | 0.37 |
| city_Seattle | $\beta_{2}$ | -0.60 | -1.91 | 0.69 |

| Metrics | Mean | 95% BCI Lo | 95% BCI Hi |
|---|---|---|---|
| $\sigma$ | 10.30 | 8.11 | 13.08 |
| $R^2$ | 0.00 | 0.00 | 0.01 |
:( Unfortunately, I am running into a Python problem where the model cannot process the event time variable; the colon in dummy names like `time_18:00` is most likely the culprit, since `:` is not a valid character inside a patsy formula term.
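Assuming the colon is indeed the problem, one workaround is to sanitize the dummy column names before building the formula. A minimal sketch with a hypothetical frame standing in for `events`:

```python
import pandas as pd

# Hypothetical stand-in for the events frame: a dummy column whose name
# contains ":" cannot appear as a bare term in a patsy formula.
df = pd.DataFrame({"rsvp_count": [5, 9, 3], "time_18:00": [1, 0, 1]})

# Strip the colon so the column becomes a legal formula term (time_1800).
df = df.rename(columns=lambda c: c.replace(":", ""))
print(list(df.columns))
```

With the renamed columns, a formula like `rsvp_count ~ time_1800` should parse without complaint.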
def plot_residuals(result, variables):
    figure = plt.figure(figsize=(20, 6))
    plots = len(variables)
    rows = (plots // 3) + 1
    residuals = np.array([r[0] for r in result["residuals"]])
    limits = max(np.abs(residuals.min()), residuals.max())
    n = result["n"]
    for i, variable in enumerate(variables):
        axes = figure.add_subplot(rows, 3, i + 1)
        keyed_values = sorted(zip(events[variable].values, residuals), key=lambda x: x[0])
        ordered_residuals = [x[1] for x in keyed_values]
        axes.plot(list(range(0, n)), ordered_residuals, '.', color="dimgray", alpha=0.75)
        axes.axhline(y=0.0, xmin=0, xmax=n, c="firebrick", alpha=0.5)
        axes.set_ylim((-limits, limits))
        axes.set_ylabel("residuals")
        axes.set_xlabel(variable)
    plt.show()
    plt.close()
    return residuals
residuals1 = plot_residuals(result1, ["duration", "city_Seattle", "time_18:00"])
We saw earlier that RSVP is not normally distributed, which might cause some problems, but we can correct for this by log-transforming the data.
events["log_rsvp"] = events["rsvp_count"].apply(np.log)
figure = plt.figure(figsize=(20,6))
axes = figure.add_subplot(1,1,1)
axes.hist(events.log_rsvp)
axes.set_title("Log(RSVP) Histogram")
plt.show()
plt.close()
This looks slightly better than before. It's not exactly normal but is fairly symmetric.
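The "fairly symmetric" claim can be checked numerically with a skewness statistic. A sketch on synthetic stand-in data (the real `events` frame is not reproduced here; the lognormal parameters are my own illustrative choice):

```python
import numpy as np
from scipy import stats

# Synthetic stand-in for events.rsvp_count: right-skewed positive counts.
rng = np.random.default_rng(42)
rsvp = rng.lognormal(mean=2.3, sigma=0.7, size=800)

# The raw counts are heavily right-skewed; the log is roughly symmetric
# (skewness near zero), which is what the histogram suggests visually.
print(round(stats.skew(rsvp), 2), round(stats.skew(np.log(rsvp)), 2))
```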
model = "log_rsvp ~ duration + city_Seattle"
result2 = models.bootstrap_linear_regression(model, data=events)
models.describe_bootstrap_lr(result2)
Model: log_rsvp ~ duration + city_Seattle
| | Coefficients | Mean | 95% BCI Lo | 95% BCI Hi |
|---|---|---|---|---|
| (intercept) | $\beta_{0}$ | 2.13 | 2.03 | 2.20 |
| duration | $\beta_{1}$ | 0.01 | -0.00 | 0.04 |
| city_Seattle | $\beta_{2}$ | -0.00 | -0.12 | 0.10 |

| Metrics | Mean | 95% BCI Lo | 95% BCI Hi |
|---|---|---|---|
| $\sigma$ | 0.70 | 0.66 | 0.75 |
| $R^2$ | 0.00 | 0.00 | 0.02 |
residuals_final = plot_residuals(result2, ["duration", "city_Seattle"])
figure = plt.figure(figsize=(20,6))
axes = figure.add_subplot(1,1,1)
axes.hist(residuals_final, bins=50)
axes.set_title("Residuals Histogram")
plt.show()
plt.close()
To compare this linear model with the null model, we first perform three rounds of 10-fold cross validation, estimating $R^2$ and $\sigma$ each round. Let's write a function that chunks xs into n chunks and another function that actually does the cross validation.
def chunk(xs, n):
    k, m = divmod(len(xs), n)
    return [xs[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
def cross_validation(algorithm, formula, data, evaluate, fold_count=10, repetitions=3):
    indices = list(range(len(data)))
    metrics = []
    for _ in range(repetitions):
        random.shuffle(indices)
        folds = chunk(indices, fold_count)
        for fold in folds:
            test_data = data.iloc[fold]
            train_indices = [idx for idx in indices if idx not in fold]
            train_data = data.iloc[train_indices]
            result = algorithm(formula, data=train_data)
            model = result["model"]
            y, X = patsy.dmatrices(formula, test_data, return_type="matrix")
            # y = np.ravel(y)  # might need for logistic regression
            results = models.summarize(formula, X, y, model)
            metric = evaluate(results)
            metrics.append(metric)
    return metrics
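As a quick sanity check on the fold construction, `chunk` splits the shuffled indices into n nearly equal, contiguous pieces (the definition is repeated here so the snippet runs on its own):

```python
# Repeated from above so this snippet is self-contained.
def chunk(xs, n):
    k, m = divmod(len(xs), n)
    return [xs[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

# Ten indices into three folds: fold sizes differ by at most one element.
folds = chunk(list(range(10)), 3)
print(folds)  # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
```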
formula = "rsvp_count ~ duration + city_Seattle"
result = cross_validation(models.linear_regression, formula, events, lambda r: (r["sigma"], r["r_squared"]))
It's unfortunate that my formula is this weak.
print(r"95% CI for sigma:", stats.mstats.mquantiles([r[0] for r in result], [0.025, 0.975]))
print(r"95% CI for R^2:", stats.mstats.mquantiles([r[1] for r in result], [0.025, 0.975]))
95% CI for sigma: [ 5.07680505 16.32984892]
95% CI for R^2: [-0.14224067 0.01117915]
I wanted to evaluate this model against data it was not trained on; the 10-fold cross validation simulates applying the model to data that was not used to fit it. The cross validation results above represent the range of results we should expect to see, and this is not looking good for us. :(
The following functions are all taken from course lab solutions.
from collections import defaultdict
def data_collection():
    result = dict()
    result["train"] = defaultdict(list)
    result["test"] = defaultdict(list)
    return result
def learning_curves(algorithm, formula, data, evaluate, fold_count=10, repetitions=3, increment=1):
    indices = list(range(len(data)))
    results = data_collection()
    for _ in range(repetitions):
        random.shuffle(indices)
        folds = chunk(indices, fold_count)
        for fold in folds:
            test_data = data.iloc[fold]
            train_indices = [idx for idx in indices if idx not in fold]
            train_data = data.iloc[train_indices]
            for i in list(range(increment, 100, increment)) + [100]:  # ensures 100% is always picked.
                # the indices are already shuffled so we only need to take ever increasing chunks
                train_chunk_size = int(np.ceil((i / 100) * len(train_indices)))
                train_data_chunk = data.iloc[train_indices[0:train_chunk_size]]
                # we calculate the model
                result = algorithm(formula, data=train_data_chunk)
                model = result["model"]
                # we calculate the results for the training data subset
                y, X = patsy.dmatrices(formula, train_data_chunk, return_type="matrix")
                result = models.summarize(formula, X, y, model)
                metric = evaluate(result)
                results["train"][i].append(metric)
                # we calculate the results for the test data.
                y, X = patsy.dmatrices(formula, test_data, return_type="matrix")
                result = models.summarize(formula, X, y, model)
                metric = evaluate(result)
                results["test"][i].append(metric)
    # process results; rely on the CLT...
    statistics = {}
    for k, v in results["train"].items():
        statistics[k] = (np.mean(v), np.std(v))
    results["train"] = statistics
    statistics = {}
    for k, v in results["test"].items():
        statistics[k] = (np.mean(v), np.std(v))
    results["test"] = statistics
    return results
def results_to_curves(curve, results):
    all_statistics = results[curve]
    keys = list(all_statistics.keys())
    keys.sort()
    mean = []
    upper = []
    lower = []
    for k in keys:
        m, s = all_statistics[k]
        mean.append(m)
        upper.append(m + 2 * s)
        lower.append(m - 2 * s)
    return keys, lower, mean, upper
def plot_learning_curves(results, metric, zoom=False):
    figure = plt.figure(figsize=(10, 6))
    axes = figure.add_subplot(1, 1, 1)
    xs, train_lower, train_mean, train_upper = results_to_curves("train", results)
    _, test_lower, test_mean, test_upper = results_to_curves("test", results)
    axes.plot(xs, train_mean, color="steelblue")
    axes.fill_between(xs, train_upper, train_lower, color="steelblue", alpha=0.25, label="train")
    axes.plot(xs, test_mean, color="firebrick")
    axes.fill_between(xs, test_upper, test_lower, color="firebrick", alpha=0.25, label="test")
    axes.legend()
    axes.set_xlabel("training set (%)")
    axes.set_ylabel(metric)
    axes.set_title("Learning Curves")
    if zoom:
        y_lower = int(0.9 * np.amin([train_lower[-1], test_lower[-1]]))
        y_upper = int(1.1 * np.amax([train_upper[-1], test_upper[-1]]))
        axes.set_ylim((y_lower, y_upper))
    plt.show()
    plt.close()
formula = "rsvp_count ~ city_Seattle"
result = learning_curves(models.linear_regression, formula, events, lambda r: r["sigma"])
plot_learning_curves(result, r"$\sigma$")
Despite the bad model, the training and test curves (blue and red) converged. We can attempt to validate these curves. Let's split the data into training and test sets and fit a decision tree to predict RSVP counts.
from sklearn.model_selection import train_test_split
from sklearn import tree
y = events['rsvp_count']
X = events[['city_Seattle', 'city_Redmond', 'city_Bellevue']]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)
clf = tree.DecisionTreeRegressor(random_state=0)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)
0.014441419978661107
Let's use validation curves to estimate the best tree depth.
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import validation_curve
# Depth of one to ten
param_range = range(1, 11)
train_scores, test_scores = validation_curve(
tree.DecisionTreeRegressor(),
X,
y,
param_name="max_depth",
param_range=param_range,
)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.figure(figsize = (10,6))
plt.title(f"Validation Curve with sklearn.tree.DecisionTreeRegressor")
plt.xlabel("Depth of Tree")
plt.ylabel("$R^2$")
plt.plot(param_range, train_scores_mean, label="Training score", color="blue")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2, color="darkorange")
plt.plot(param_range, test_scores_mean, label="Cross-validation score", color="red")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.2, color="navy")
plt.legend()
There doesn't appear to be a clearly optimal depth. Let's pick one anyway and see how the model does.
from sklearn.metrics import r2_score
# 4 is assumed to be the optimal depth from the validation curves above
regressor = DecisionTreeRegressor(max_depth = 4)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
r2_score(y_test, y_pred)
0.014441419978661107
This model is poor, but it's still better than our linear regression:
model = "rsvp_count ~ duration + city_Seattle"
result1 = models.bootstrap_linear_regression(model, data=events)
models.describe_bootstrap_lr(result1)
Model: rsvp_count ~ duration + city_Seattle
| | Coefficients | Mean | 95% BCI Lo | 95% BCI Hi |
|---|---|---|---|---|
| (intercept) | $\beta_{0}$ | 10.93 | 9.96 | 11.96 |
| duration | $\beta_{1}$ | 0.11 | -0.02 | 0.32 |
| city_Seattle | $\beta_{2}$ | -0.60 | -1.71 | 0.55 |

| Metrics | Mean | 95% BCI Lo | 95% BCI Hi |
|---|---|---|---|
| $\sigma$ | 10.30 | 7.62 | 12.37 |
| $R^2$ | 0.00 | 0.00 | 0.01 |
Let's look at the feature importance. The first thing we want to do is get the numerical feature importances and then make a list of tuples pairing each variable with its importance.
feature_list = list(X.columns)
importances = list(regressor.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort and print the feature importances
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
Variable: city_Bellevue Importance: 0.8
Variable: city_Seattle Importance: 0.11
Variable: city_Redmond Importance: 0.09
This is interesting. I thought Seattle would be more important as there are more fun activities to host there.
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');
This could be an indication that members are more likely to RSVP when the event takes place in Bellevue.