# Initialize an empty list to store trimmed DataFrames
= []
trimmed_dfs
# Iterate through the list of XLSX file names
"ignore", category=UserWarning, message="Cannot parse header or footer so it will be ignored")
warnings.filterwarnings(
for file_name in xlsx_df['file_name']:
# Load the XLSX file into a DataFrame
= os.path.join(directory_path, file_name)
full_path = pd.read_excel(full_path )
df = df
trimmed_df
if (0):
# Find the row index where "TERM" is in the first column
= df.index[df.iloc[:, 0] == "TERM"].tolist()[0]
term_index
# # Find the row index where "N=" is in the first column
# n_index = df.index[df.iloc[:, 0].fillna('x').astype(str).str.startswith("N =")].tolist()[0]
# Clip the desired block and add column names
= df.loc[term_index + 1 : n_index - 1]
trimmed_df = df.iloc[term_index].values
trimmed_df.columns
# Append the trimmed DataFrame to the list
trimmed_dfs.append(trimmed_df)
# Combine the individual dataframes into one big one.
= pd.concat(trimmed_dfs, ignore_index=True) sections_df
Cleaning the data
This script describes the data and walks through a process of data cleaning.
At completion of this script all the individual semester data files will be combined into a single dataframe for analysis, and the dataframe will be refactored for cleaner analysis.
TL;DR: Here is a link to the cleaned dataframe. This dataframe contains one tuple per (TERM,CRN,INSTRUCTOR,MEETING_CODE).
TL;DR: Here is a link to the google sheet with the cleaned data loaded in the “source data” tab and other sheets referencing the source data.
Data sources included in the cleaned data stack.
The files are named appropriately and stored in the data folder in this repository.
Data Cleaning
The following sections combine the individual sources of data and clean them up.
Combining to single data frame
The following section combines each XLSX into a single data frame. The contents of the files are variable in length depending on the number of sections taught during any given semester.
The appropriate block starts one row after the row with the keyword TERM in the first column, denoting the header row. The appropriate block ends with the row prior to the row with the keyword N =.
We’re assuming that all xlsx workbooks share the same columns and names.
Merge in the term and period data from the xlsx_df dataframe
The following code merges in the term and period data with the section data. First ensure that the key columns are strings, then merge away.
'SECT'] = sections_df['SECT'].astype(str)
sections_df['TERM'] = sections_df['TERM'].astype(str)
sections_df['term_code'] = xlsx_df['term_code'].astype(str)
xlsx_df[= pd.merge(sections_df,xlsx_df,left_on='TERM', right_on='term_code', how='left') sections_df
Clean up column names
The new VCU report added new-line characters to a few of the column names. This code cleans them up.
= sections_df.columns.str.replace('\n', ' ')
sections_df.columns "TITLE"] = sections_df["TITLE"].str.replace("\n"," ")
sections_df["CROSS-LIST"] = sections_df["CROSS-LIST"].str.replace("\n"," ")
sections_df["MODALITY TEXT"] = sections_df["MODALITY TEXT"].str.replace("\n"," ") sections_df[
Remove duplicate instructor1 and instructor2
def replace_matches(row):
if row["PRIMARY INSTRUCTOR ID"] == row["SECONDARY INSTRUCTOR ID"]:
'SECONDARY INSTRUCTOR ID'], row['SECONDARY INSTRUCTOR LAST NAME'] = '', ''
row[return row
= sections_df.apply( replace_matches,axis=1) sections_df
Add instructor first name columns in appropriate spot
These fields were dropped when the report changed layouts in Fall 2024. Instructor first name is good to have, so these fields were added back and can be remapped later.
= sections_df.columns.get_loc("PRIMARY INSTRUCTOR LAST NAME") + 1
insert_position "PRIMARY INSTRUCTOR FIRST NAME", "")
sections_df.insert(insert_position, = sections_df.columns.get_loc("SECONDARY INSTRUCTOR LAST NAME") + 1
insert_position "SECONDARY INSTRUCTOR FIRST NAME", "") sections_df.insert(insert_position,
The logic for identifying instructors throughout the report will need to be fixed.
Fix known errors
The data in Banner doesn’t always reflect reality. This step corrects known errors in the data.
Note that the section data are stored one row per term-crn-meeting period.
The purpose of this section is to demonstrate an auditable process for documenting changes to the source data.
There is nothing magic about the changes below. As I shared the data with others, they found that the banner data did not reflect reality. For this workload analysis we want the data to reflect reality, so we fix it.
Additional fixes are expected to be added as new eyes look at their data.
if (0):
# Spring 2023: CMSC475 John Leonard taught for David Shepherd
1,'Swap Leonard for Shepherd',{'TERM':'202320','CRN':'43471'},{'PRIMARY INSTRUCTOR FIRST NAME':'John','PRIMARY INSTRUCTOR LAST NAME':'Leonard'})
fix_sections(
# Fall 2024: CMSC391 is cross listed with COAR463.
# Total enrollment 34 across two instructors (Bennett and Leonard)
# Currently Banner shows only 20 in the JL section and doesn't mention the COAR section.
2,'Fix incorrect cross listing',{'TERM':'202410','CRN':'46263'},{'ACTUAL ENROLLMENT':34})
fix_sections(
3,"Capstone common course is LEC not SEM",{'TERM':'202320','CRN':'45290'},{'TYPE':'LEC'})
fix_sections(
4,"Rename instructor",
fix_names('first_name':'Sheng-Chieh','last_name':'Chen'},{'first_name':'Shawn','last_name':'Chen'})
{
5,"Rename instructor",
fix_names('first_name':'Bernard','last_name':'Gupton'},{'first_name':'Frank','last_name':'Gupton'})
{
6,"Rename instructor",
fix_names('first_name':'Anathea','last_name':'Pepperl'},{'first_name':'Thea','last_name':'Pepperl'})
{
7,"Rename instructor",
fix_names('first_name':'Robert','last_name':'Dahlberg'},{'first_name':'Bob','last_name':'Dahlberg'})
{
8,"Rename instructor",
fix_names('first_name':'Philip','last_name':'Longest'},{'first_name':'Worth','last_name':'Longest'})
{
9,"Rename instructor",
fix_names('first_name':'Jonathan','last_name':'Buffkin'},{'first_name':'Seth','last_name':'Buffkin'})
{
10,"Rename instructor",
fix_names('first_name':'Jonathan','last_name':'Buffkin'},{'first_name':'Seth','last_name':'Buffkin'}) {
Reshape the data
The current dataframe sections_df contains one record per term-crn-meeting period tuple. Within each tuple there can be up to 2 instructors. We need to reshape the data with the instructors in a single column.
While we’re here we can combine instructor first and last name, and drop verbose columns.
= sections_df.columns
cols = ['PRIMARY INSTRUCTOR ID','PRIMARY INSTRUCTOR FIRST NAME','PRIMARY INSTRUCTOR LAST NAME','SECONDARY INSTRUCTOR ID','SECONDARY INSTRUCTOR FIRST NAME','SECONDARY INSTRUCTOR LAST NAME']
values_to_remove = [x for x in cols if x not in values_to_remove]
cols
"ins1_id"] = sections_df['PRIMARY INSTRUCTOR ID']
sections_df['ins1_last'] = sections_df['PRIMARY INSTRUCTOR LAST NAME']
sections_df['ins1_first'] = sections_df['PRIMARY INSTRUCTOR FIRST NAME']
sections_df["ins2_id"] = sections_df['SECONDARY INSTRUCTOR ID']
sections_df['ins2_last'] = sections_df['SECONDARY INSTRUCTOR LAST NAME']
sections_df['ins2_first'] = sections_df['SECONDARY INSTRUCTOR FIRST NAME']
sections_df['instructor_1'] = sections_df['ins1_last']+sections_df['ins1_first'].apply(lambda x: ',' if x != "" else "")+sections_df['ins1_first']
sections_df['instructor_2'] = sections_df['ins2_last']+sections_df['ins2_first'].apply(lambda x: ',' if x != "" else "")+sections_df['ins2_first']
sections_df[
= sections_df.fillna( {'instructor_1':'','instructor_2':''}) sections_df
We’ve got two variable we’d like to stack: ins1_id and ins2_id into instructor_id and instructor_1 and instructor_2 into instructor_name.
We do this by melting each individually, then concatenating the data frames.
= pd.melt(sections_df,
df1 =cols,
id_vars=['instructor_1','instructor_2'],
value_vars='instructor source',
var_name='instructor_name_banner'
value_name )
= pd.melt(sections_df,
df2 =cols,
id_vars=['ins1_id','ins2_id'],
value_vars='instructor id source',
var_name='instructor_id'
value_name )
= pd.concat([df2[cols+["instructor_id"]], df1[['instructor_name_banner','instructor source']]], axis=1) stacked_df
Clean up rows
The process above introduces records with missing instructor_2. This code removes records missing instructor_2.
# remove rows with empty instructor 2. Keep rows with empty instructor 1.
= stacked_df[ ~ ((stacked_df['instructor source']=='instructor_2') & (stacked_df['instructor_id'].isna() | (stacked_df['instructor_id']==''))) ] stacked_df
In some summer courses an instructor was not listed in the data. An instructor name is generated using the course and semester. This can be cleaned later.
# replace any missing instructors with note
"instructor_id"].isna(),"instructor_id"] = stacked_df[stacked_df["instructor_id"].isna()]['COURSE']+'-'+stacked_df[stacked_df["instructor_id"].isna()]['TERM']+'-'+stacked_df[stacked_df["instructor_id"].isna()]['CRN'] stacked_df.loc[stacked_df[
Some sections, for example RES sections or cancelled sections have zero enrollments. These are removed, keeping only sections with positive enrollments.
# drop rows with zero enrollments
= stacked_df[stacked_df['ACTUAL ENROLLMENT']>0] stacked_df
While not necessary, the resulting dataframe can be sorted so that it looks pretty when saved to a CSV.
# Sort the data frame so it looks pretty in the output file.
= stacked_df.sort_values(['TERM','DEPT','COURSE','SECT','instructor_name_banner']) stacked_df
Add additional data columns
Here I’ll add a few buffer columns, in case the structure of the report changes, I can add or delete columns here to keep the spreadsheet from breaking.
"buffer_1"] = ""
stacked_df["buffer_2"] = ""
stacked_df["buffer_3"] = ""
stacked_df["buffer_4"] = ""
stacked_df["buffer_5"] = ""
stacked_df["buffer_6"] = "" stacked_df[
We may need the subject and the course number in the workload model analysis.
"course_subject"] = stacked_df["COURSE"].str[:4]
stacked_df["course_number"] = stacked_df["COURSE"].str[4:] stacked_df[
For each section, add a new field with the count of instructors sharing this section. This helps with workload and other computations later.
# Number of instructors sharing a common CRN.
"instructor_cnt"] = stacked_df.groupby(['TERM','COURSE','CRN'])['CRN'].transform('count') / stacked_df.groupby(['TERM','COURSE','CRN','instructor_id'])['instructor_id'].transform('count')
stacked_df[
"capstone_cnt"] = 0.0 # used downstream to provide an actual capstone count rather than an estimated count of capstone sections. stacked_df[
It’s getting old adding new columns, then having to rework the formuluas in the rather brittle google sheet. I’m adding a few spare columns here to be used as necessary. This is the best place to add the new columns, just prior to the calculation of the aggregate measures and workload attributes.
"lec_only_course_list"] = ""
stacked_df["sum_term_crse_crn_hours_lec"] = 0 # used 4/27/2024. Search for "Store LEC-only workload", below, after workload calcs.
stacked_df["spare_col_3"] = 0
stacked_df["spare_col_4"] = 0
stacked_df["spare_col_5"] = 0
stacked_df["spare_col_6"] = 0 stacked_df[
Save instructor names to file for later use
if (0):
= stacked_df[["instructor_id","instructor_name_banner"]].drop_duplicates().sort_values(by="instructor_name_banner")
unique_instructors "unique_instructors.csv",index=False) unique_instructors.to_csv(
Merge persistent instructor data
The Banner report that we’re using for this analysis does not include all the necessary data for a proper analysis. The report infers the offering department of the course (e.g., ENGR497 or CMSC475), but not the home department of the instructor. For example, capstone courses are coded ad ENGR497, but the record won’t include the home department of the instructor, making it difficult to roll up all the courses by home department of the instructor.
We store persistent instructor and course data in a separate google sheet. As these persistent data are changed or corrected, this analysis should be regenerated to use these amended data.
The code below merges the persistent instructor data from the google sheet into the working dataframe.
# define scope
= ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
scope
# create credentials object
= os.path.join(os.path.expanduser("~"), ".gsecrets", "gsheets-credentials.json")
credential_file if not os.path.isfile( credential_file ):
print("Missing credential file:",credential_file)
sys.exit()
# authorize the client
= ServiceAccountCredentials.from_json_keyfile_name(credential_file, scope)
creds = gspread.authorize(creds)
client
= "1ZK7k8M85CXLof6FdeJYJuGFbfjsOXrCv5mc7OgUInWw"
spreadsheet_key = "Instructor data"
worksheet_name
= client.open_by_key(spreadsheet_key).worksheet(worksheet_name)
sheet = get_as_dataframe(worksheet=sheet, evaluate_formulas=True) instructor_df
= pd.merge( stacked_df,instructor_df,left_on='instructor_id',right_on='instructor_id',how='left') stacked_df
= stacked_df[stacked_df['ins_dept'].isna()]
missing_records 'instructor_id','COURSE','TERM']].drop_duplicates(subset='instructor_id') missing_records[[
instructor_id | COURSE | TERM | |
---|---|---|---|
2351 | EGRB697-202230-10626 | EGRB697 | 202230 |
2354 | EGRB697-202230-35073 | EGRB697 | 202230 |
5256 | EGRE697-202510-43183 | EGRE697 | 202510 |
= "Instructor notes"
worksheet_name = client.open_by_key(spreadsheet_key).worksheet(worksheet_name)
sheet = get_as_dataframe(worksheet=sheet, evaluate_formulas=True )
instructor_notes_df = instructor_notes_df.dropna(subset=['instructor_id']) instructor_notes_df
= pd.merge( stacked_df,instructor_notes_df,left_on='instructor_id',right_on='instructor_id',how='left') stacked_df
Merge persistent course data
Persistent course data is also stored in the google sheet and merged with the working dataframe.
# spreadsheet_key = "1ZK7k8M85CXLof6FdeJYJuGFbfjsOXrCv5mc7OgUInWw" from above!
= "Course notes"
worksheet_name = client.open_by_key(spreadsheet_key).worksheet(worksheet_name)
sheet = get_as_dataframe(worksheet=sheet, evaluate_formulas=True )
course_notes_df = course_notes_df.dropna(subset=['crse']) course_notes_df
= pd.merge( stacked_df,course_notes_df,left_on='COURSE',right_on='crse',how='left') stacked_df
= stacked_df[stacked_df['crse_url'].isna()].drop_duplicates(subset='COURSE')
missing_records 'COURSE','TERM']] missing_records[[
COURSE | TERM | |
---|---|---|
5440 | ENGR698 | 202510 |
Find all courses taught by same instructor at the same time
This code builds a list of separate courses taught by an instructor at the same time. This code is similar to the code above with the enhancement that it produces an explicit list of courses that share the same room and time.
def concat_courses(group):
return ','.join(group['COURSE'])
= pd.DataFrame();
temp_df = ['TERM','instructor_id','time_code']
keys "concat"] = stacked_df.groupby(keys).apply(concat_courses,include_groups=False)
temp_df["list"] = temp_df["concat"].apply( lambda x : list(set(x.split(","))) )
temp_df["combined_count"] = temp_df["list"].apply( lambda x : len(x) )
temp_df["combined_sections"] = temp_df["list"].apply( lambda x : ",".join(x) )
temp_df[= temp_df.reset_index()
temp_df = temp_df[ (temp_df['time_code']!='.........')&(temp_df['combined_count']>1)]
temp_df = temp_df[['TERM','instructor_id','time_code','combined_count','combined_sections']]
temp_df = pd.merge(stacked_df,temp_df,left_on=keys, right_on=keys,how="left") stacked_df
CGEP: convert to IND rather than LEC from workload perspective
Cardinal Education (formally known as the Commonwealth Graduate Engineering Program (CGEP)) is a collaborative effort among six participating universities and institutions throughout the Commonwealth of Virginia. It utilizes virtual learning classrooms and education technology in synchronous and asynchronous formats to provide working engineers the opportunity to earn a master’s degree in engineering from any of the six participating universities/institutions.
CGEP courses hosted at VCU are coded as LEC in Banner and must be associated with a VCU instructor. This ensures that the student receives proper credit towards graduation. However, from a workload perspective, these courses do not require local instructor preparation or teaching effort.
This section recodes CGEP courses to independent study (IND).
# Function to set the 'TYPE' column based on a string search in 'TITLE'
def set_type_based_on_search(row):
= row['TITLE']
title = row['TYPE']
newtype if 'CGEP' in title:
= 'IND'
newtype return newtype
# Apply the function to the entire DataFrame
'TYPE'] = stacked_df.apply(set_type_based_on_search, axis=1) stacked_df[
Note that additional spare columns are added to persistent instructor and course google sheets.
Compute appropriate aggregate measures
These measures are used in later calculations for numbers of courses, sections, instructors, etc.
These computations are normalized to the appropriate group so that they sum to the correct values when aggregated at the specified group level.
# Create various aggregate columns
'sum_term'] = 1.0 / stacked_df.groupby(['TERM'])['TERM'].transform('count')
stacked_df['sum_term_crse'] = 1.0 / stacked_df.groupby(['TERM','COURSE'])['COURSE'].transform('count')
stacked_df['sum_term_crse_crn'] = 1.0 / stacked_df.groupby(['TERM','COURSE','CRN','time_code'])['CRN'].transform('count') / stacked_df['mtgs_per_crn']
stacked_df['sum_term_crse_crn_mtg_students'] = stacked_df['ACTUAL ENROLLMENT'] * stacked_df['sum_term_crse_crn'] * stacked_df['mtgs_per_crn']
stacked_df['sum_term_crse_crn_hours'] = stacked_df['sum_term_crse_crn_mtg_students'] * stacked_df['MAX CREDITS'] / stacked_df['mtgs_per_crn'] stacked_df[
Placeholder Workload model assignments
These are placeholders for any proposed workload model based on course and instructor attributes.
For each workload model, there is a:
- sum_term_crse_wrkld containing the numeric assignment of workload for that entry in the dataframe.
- wrkld__type* containing the recoded course type. For example, some LEC sections that are actually labs are coded labs. Capstones are also highlighted.
- wrkld_note containing a note about the specific workload assignment.
'wrkld_sample_type'] = stacked_df['TYPE']
stacked_df['sum_term_crse_wrkld_sample'] = 0.0
stacked_df['sum_term_crse_wrkld_sample_lec'] = 0.0
stacked_df['wrkld_sample_note'] = ""
stacked_df['wrkld_a_type'] = stacked_df['TYPE']
stacked_df['sum_term_crse_wrkld_a'] = 0.0
stacked_df['sum_term_crse_wrkld_a_lec'] = 0.0
stacked_df['wrkld_a_note'] = ""
stacked_df['wrkld_b_type'] = stacked_df['TYPE']
stacked_df['sum_term_crse_wrkld_b'] = 0.0
stacked_df['sum_term_crse_wrkld_b_lec'] = 0.0
stacked_df['wrkld_b_note'] = ""
stacked_df['wrkld_c_type'] = stacked_df['TYPE']
stacked_df['sum_term_crse_wrkld_c'] = 0.0
stacked_df['sum_term_crse_wrkld_c_lec'] = 0.0
stacked_df['wrkld_c_note'] = "" stacked_df[
Sample workload model
This is the sample workload model. The assigned faculty workoad will be stored in the sum_term_crse_wrklod_sample field. Here is the model:
= {
model_sample 'base_lecture_value':1.0,
'labs_per_lecture':3.0,
'vips_per_lecture':3.0,
'capstones_per_lecture': 3.0,
'students_per_capstone': 4.0,
'seminars_per_lecture': 1.0,
'res_ind_fld_per_lecture': 0.0,
'large_classes': [ [80,1.5],[160,2.0],[240,2.5] ]
}
- Each section/CRN is worth a full teaching credit.
- Research, indepedent study and coop/intern sections receive zero teaching credit in this workload model. Instructors receive credit through their salary if they’re staff. Research active faculty get reduced teaching loads.
- Lab sections including actual course labs, capstone/senior design, and VIP receive 1/3 credit. This is consistent with the model that 3 lab hours is equivalent to 1 teaching hour. (e.g., 3-3-4 courses.) NOTE that laboratory sections are NOT coded as separate labs, rather they are coded as LEC making it difficult to discern these. See EGRE306 for an example.
- Seminar sections get full section credit as a positive incentive. There is a limited number of SEM courses, they are important to the curriculum, and we want them covered.
- Capstone designs are scaled to give one LAB (0.33 per above) unit per groups of 4 students.
Assign BASE workload to all sections
Assign base workload to all records. All sections independent of section type start with a base workload of 1.
Adjust base workload for courses shared by multiple instructors.
In some cases, a shared instructor was included in Banner so that the shared instructor could monitor the course in Canvas only, without offering additional teaching effort. In these cases, I recommend adding a “FIX” (listed above) to remove the secondary instructor from the workload data, ensuring that they don’t get credit for a class they aren’t actually co-teaching.
# Assign standard workload : One teaching course shared across multiple instructors
"sum_term_crse_wrkld_sample"] = model_sample['base_lecture_value']
stacked_df[# Update note
'wrkld_sample_type']=='LEC'),'wrkld_sample_note'] = "BASE: [wrkld] "
stacked_df.loc[(stacked_df[
# Adjust for multiple instructors
"sum_term_crse_wrkld_sample"] = stacked_df["sum_term_crse_wrkld_sample"] / stacked_df["instructor_cnt"]
stacked_df[# Update note
'wrkld_sample_type']=='LEC')&(stacked_df['instructor_cnt']>1.0),'wrkld_sample_note'] = "BASE: [wrkld] / (2 co-teaching) " stacked_df.loc[(stacked_df[
Adjust for courses taught at same time by same instructor
# adjust workload for CRN sharing same teaching time
'shared_mtgs_cnt']>1),'sum_term_crse_wrkld_sample'] = stacked_df.loc[(stacked_df['shared_mtgs_cnt']>1),'sum_term_crse_wrkld_sample'] / stacked_df.loc[(stacked_df['shared_mtgs_cnt']>1),'shared_mtgs_cnt']
stacked_df.loc[(stacked_df[
# Update note to reflect adjustment
'shared_mtgs_cnt']>1),'wrkld_sample_note'] = (
stacked_df.loc[(stacked_df['shared_mtgs_cnt']>1),'wrkld_sample_note'] + "/ " +
stacked_df.loc[(stacked_df['shared_mtgs_cnt']>1),'shared_mtgs_cnt'].astype(str) +" CRN at same time"
stacked_df.loc[(stacked_df[ )
Adjust for capstone workloads
Capstone / senior design courses are coded differently in each department within the college. Further, some of these course numbers change over time (across semesters).
A list of capstone courses was assembled manually and then applied to convert the capstone sections to CAP.
This code is applied in two steps with an opportunity to introduce adjustments to the estimate of capstones.
In this sample workload model, estimates of the number of capstones are rounded up (CEIL) to the next highest integer number of capstones.
# Assign CAP workload
= model_sample['capstones_per_lecture']
capstones_per_lecture = model_sample['students_per_capstone']
students_per_capstone
= ['CLSC403','EGRB401','EGRB402','CMSC441','CMSC442','CMSC451','CMSC452','EGMN402','EGMN403','ENGR402','ENGR403','EGRE404','EGRE405']
senior_design_courses for course in senior_design_courses:
# Set section type
'COURSE']==course)&(stacked_df['TYPE']=='LAB'),'wrkld_sample_type'] = 'CAP'
stacked_df.loc[(stacked_df[
# Estimate number of capstone sections using load of "students_per_capstone". Round up or down.
# This code was added after the initial code build, so we're storing the capstone_cnt in one of the spares we created above.
'COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'capstone_cnt'] = ( 1.0* stacked_df.loc[(stacked_df['COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'ACTUAL ENROLLMENT'].astype(float) / students_per_capstone)
stacked_df.loc[(stacked_df[
# apply CEIL function
'COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'capstone_cnt'] = stacked_df.loc[(stacked_df['COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'capstone_cnt'].apply( np.ceil ) stacked_df.loc[(stacked_df[
Now, apply any necessary fixes.
## apply any fixes to number of capstone sections. Use TERM,CRN as keys
Now, actually compute the workload based on the capstone_cnt.
for course in senior_design_courses:
# Using the capstone count above, calculate the capstone workload
'COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'sum_term_crse_wrkld_sample'] = (
stacked_df.loc[(stacked_df['COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'sum_term_crse_crn'] / capstones_per_lecture * ( stacked_df.loc[(stacked_df['COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'capstone_cnt'] ) )
stacked_df.loc[(stacked_df[
# Update note to show adjustments
'COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'wrkld_sample_note'] = (f"RULE: [wrkld]={(1.0/capstones_per_lecture):0.2f} for every {students_per_capstone} students in CAP (CAP=" +
stacked_df.loc[(stacked_df['COURSE']==course)&(stacked_df['wrkld_sample_type']=='CAP'),'capstone_cnt'].astype(str) + ")") stacked_df.loc[(stacked_df[
Distinguish VIP workloads
Vertically Integrated Programs (VIP) provide undergraduate students the opportunity to participate in course-based, multiyear, multidisciplinary, team-based projects under the guidance of faculty and graduate students. These projects are in the faculty areas of expertise, with the main criterion for participation being that of mutual interest.
Managing a VIP requires effort on the part of the faculty, yet at a lower rate than a standard class.
# Assign VIP workload
= model_sample['vips_per_lecture']
vips_per_lecture
'course_number'].isin(['497']),'wrkld_sample_type'] = "VIP"
stacked_df.loc[stacked_df['wrkld_sample_type'].isin(['VIP']),'wrkld_sample_note'] = f"RULE: [wrkld]={(1.0/vips_per_lecture):0.2f} for VIP"
stacked_df.loc[stacked_df['wrkld_sample_type'].isin(['VIP']))&(stacked_df['instructor_cnt']>1.0),'wrkld_sample_note'] = f"RULE: [wrkld]={(1.0/vips_per_lecture):0.2f} for VIP (co-teaching)"
stacked_df.loc[(stacked_df['wrkld_sample_type'].isin(['VIP']),'sum_term_crse_wrkld_sample'] = stacked_df.loc[stacked_df['wrkld_sample_type'].isin(['VIP']),'sum_term_crse_wrkld_sample'] / vips_per_lecture stacked_df.loc[stacked_df[
Distinguish LAB workloads
Laboratory sections (LAB) are an important part of an engineering and computer science curriculum.
LAB sections require effort on the part of the faculty member, but at a reduced rate.
Identifying lab sections is tricky. The College often codes lab sections differently across departments. Some are coded properly as LAB sections.
In other cases, LAB sections are coded as LEC because they share the same CRN as their owner lecture. To capture these cases a special logic is introduced. If a section is taught once per week (e.g., M or W or TH) and the CRN has more than one meeting period, then the section with one meeting period is designated a LAB section.
In reviewing the data I found some really odd coding. To accomodate this, I added an override block to turn specific tuples into lab sections. This logic must be maintained manually until the College decides to rework the schedule.
# Assign LAB workload
= model_sample['labs_per_lecture']
labs_per_lecture
# Assign LAB sections
'mtgs_per_crn']>1)&(stacked_df['wrkld_sample_type']=='LEC')&(stacked_df['mtgs_per_wk']==1),'wrkld_sample_type'] = 'LAB'
stacked_df.loc[(stacked_df[
# Apply overrides for LAB sections
if (0):
100,"2x lab mtg",
fix_stacked('TERM':'202410','CRN':12151,'time_code':'.T.R...14511559','instructor_id':'Abdelwahed,Sherif'},
{'wrkld_sample_type':'LAB'}
{
)
# Create a note for all LAB sections
'wrkld_sample_type'].isin(['LAB']),'wrkld_sample_note'] = f"RULE: [wrkld]={(1.0/labs_per_lecture):0.2f} for LAB"
stacked_df.loc[stacked_df[
# Assign workload
'wrkld_sample_type'].isin(['LAB']),'sum_term_crse_wrkld_sample'] = stacked_df.loc[stacked_df['wrkld_sample_type'].isin(['LAB']),'sum_term_crse_wrkld_sample'] / labs_per_lecture
stacked_df.loc[stacked_df[
# Create a special note for LAB sections that used the special logic to be identified.
'mtgs_per_crn']>1)&(stacked_df['wrkld_sample_type']=='LAB')&(stacked_df['mtgs_per_wk']==1),'wrkld_sample_note'] = stacked_df.loc[(stacked_df['mtgs_per_crn']>1)&(stacked_df['wrkld_sample_type']=='LAB')&(stacked_df['mtgs_per_wk']==1),'wrkld_sample_note'] + " (LAB: 1 mtg per wk rule)" stacked_df.loc[(stacked_df[
Distinguish RES, IND and FLD
RES, IND and FLD sections are variable unit placeholders in banner that contribute to the full-time status calculation of a student, but don’t necessarily reflect effort of the faculty mentor.
Research active faculty are most associated with RES, IND and FLD. To capture the workload we have two options:
- We can associate these units with a non-zero workload value, or
- We can reduce the workloads of research active faculty.
To do both would double-count the effort of RES, IND and FLD sections.
For purposes of this workload model, we’ll zero these out and let research active faculty have reduced workloads.
# Exclude teaching credit for research (RES), independent study (IND), and intern/coop (FLD).
# Credit is given for these activities in reduced teaching (research active) or summer pay.
'wrkld_sample_type'].isin(['RES','IND','FLD']),'wrkld_sample_note'] = "RULE: [wrkld]=0.00 for INS, RES and FLD"
stacked_df.loc[stacked_df['wrkld_sample_type'].isin(['RES','IND','FLD']),'sum_term_crse_wrkld_sample'] = model_sample['res_ind_fld_per_lecture'] stacked_df.loc[stacked_df[
Distinguish SEM workloads
The seminar class is an important part of our curriculum and requires effort of those organizing the course.
In this workload model, the seminar counts as a full workload course.
# Assign SEM workload
# Ensure that SEM get full credit because we want to reward the faculty member for doing it!
= model_sample['seminars_per_lecture']
seminars_per_lecture
'wrkld_sample_type'].isin(['SEM']),'wrkld_sample_note'] = f"RULE: [wrkld] / {seminars_per_lecture} for SEM"
stacked_df.loc[stacked_df['wrkld_sample_type'].isin(['SEM']))&(stacked_df['instructor_cnt']>1.0),'wrkld_sample_note'] = f"RULE: [wrkld] / {seminars_per_lecture} for SEM (co-teaching)"
stacked_df.loc[(stacked_df['wrkld_sample_type'].isin(['SEM']),'sum_term_crse_wrkld_sample'] = stacked_df.loc[stacked_df['wrkld_sample_type'].isin(['SEM']),'sum_term_crse_wrkld_sample'] / seminars_per_lecture stacked_df.loc[stacked_df[
Adjust for large lectures
Larger lectures require more effort. The code blocks below adjust the base workload for large classes.
# Process large classes using thresholds and weights in model_sample parameters.
for threshhold,weight in model_sample['large_classes']:
'ACTUAL ENROLLMENT']>=threshhold)&(stacked_df['wrkld_sample_type']=='LEC'),"sum_term_crse_wrkld_sample"] = weight / stacked_df["instructor_cnt"]
stacked_df.loc[(stacked_df['ACTUAL ENROLLMENT']>=threshhold)&(stacked_df['wrkld_sample_type']=='LEC'),"wrkld_sample_note"] = f"BASE: [wrkld] = {weight} per CRN ENRL>={threshhold}"
stacked_df.loc[(stacked_df['ACTUAL ENROLLMENT']>=threshhold)&(stacked_df['wrkld_sample_type']=='LEC')&(stacked_df['instructor_cnt']>1.0),'wrkld_sample_note'] = f"BASE: [wrkld]={weight/2.0} per CRN ENRL>={threshhold} (co-teaching)" stacked_df.loc[(stacked_df[
Store LEC-only workload in separate column.
'sum_term_crse_wrkld_sample_lec'] = 0.0
stacked_df['wrkld_sample_type']=='LEC'),'sum_term_crse_wrkld_sample_lec'] = stacked_df.loc[(stacked_df['wrkld_sample_type']=='LEC'),'sum_term_crse_wrkld_sample']
stacked_df.loc[(stacked_df[
# Also, store lecture-only hours (based on workload assignment) to spare value
'sum_term_crse_crn_hours_lec'] = 0.0
stacked_df['wrkld_sample_type']=='LEC'),'sum_term_crse_crn_hours_lec'] = stacked_df.loc[(stacked_df['wrkld_sample_type']=='LEC'),'sum_term_crse_crn_hours'] stacked_df.loc[(stacked_df[
Store list of LECT-only courses
def list_courses( x ):
= x.unique().tolist()
l = list(filter(lambda item: item != "", l))
l return ",".join(l)
= stacked_df.copy()
temp "lec-only-course"] = ""
temp['wrkld_sample_type']=='LEC',"lec-only-course"] = temp.loc[temp['wrkld_sample_type']=='LEC',"COURSE"]
temp.loc[temp['lec_only_course_list'] = temp.groupby(['TERM','instructor_id'])['lec-only-course'].transform( list_courses )
stacked_df[
## Do fixes
if (0):
'instructor_id']=='Leonard,John')&(stacked_df['TERM'].isin(['202410','202320']))][['TERM','COURSE','lec_only_course_list']] stacked_df[(stacked_df[
Summary
At this point the model building is done and the workload assignments are stored in the appropriate columns.
Examples, checks and verifications
The following code blocks are added as unique situations are discovered. The code below shows how these situations are handled and demonstrates how they were corrected.
As odd situations are revealed, add a code block here to clearly show the inconsistency. Then, change the workload code above until the situation clears up AND you don’t screw up any of the other situations.
Example: LECT / LAB
In this example, a lecture/lab course was coded as 3 separate CRN. Each CRN has a main lecture and a single-meeting-period lab.
All three lectures share the same meeting time and are combined into a single workload LECT. Each lab section is assigned 1/3 workload.
However, two of the lab sections share a common meeting time, so they are combined into a single 1/3 workload section.
def show_block( df,keys,cols):
= df
tdf for key in keys.keys():
= tdf[tdf[key]==keys[key]]
tdf return tdf[cols]
show_block(stacked_df,'TERM':'202410','COURSE':'EGMN416'},
{'COURSE','CRN','TYPE','TITLE','instructor_id','instructor_name','time_code',
['sum_term_crse_crn',
'sum_term_crse_crn_mtg_students',
'sum_term_crse_crn_hours',
'sum_term_crse_wrkld_sample','wrkld_sample_type','wrkld_sample_note']
)
COURSE | CRN | TYPE | TITLE | instructor_id | instructor_name | time_code | sum_term_crse_crn | sum_term_crse_crn_mtg_students | sum_term_crse_crn_hours | sum_term_crse_wrkld_sample | wrkld_sample_type | wrkld_sample_note | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4067 | EGMN416 | 45899 | LEC | MECHATRONICS | V00768493 | Hadimani,Ravi | ....F..1400.01650.0 | 0.5 | 31.0 | 46.5 | 0.333333 | LAB | RULE: [wrkld]=0.33 for LAB (LAB: 1 mtg per wk ... |
4068 | EGMN416 | 45899 | LEC | MECHATRONICS | V00768493 | Hadimani,Ravi | M...F..1300.01350.0 | 0.5 | 31.0 | 46.5 | 0.333333 | LEC | BASE: [wrkld] / 3 CRN at same time |
4069 | EGMN416 | 32709 | LEC | MECHATRONICS | V00768493 | Hadimani,Ravi | M......1400.01650.0 | 0.5 | 30.0 | 45.0 | 0.166667 | LAB | RULE: [wrkld]=0.33 for LAB (LAB: 1 mtg per wk ... |
4070 | EGMN416 | 32709 | LEC | MECHATRONICS | V00768493 | Hadimani,Ravi | M...F..1300.01350.0 | 0.5 | 30.0 | 45.0 | 0.333333 | LEC | BASE: [wrkld] / 3 CRN at same time |
4071 | EGMN416 | 46922 | LEC | MECHATRONICS | V00768493 | Hadimani,Ravi | M......1400.01650.0 | 0.5 | 29.0 | 43.5 | 0.166667 | LAB | RULE: [wrkld]=0.33 for LAB (LAB: 1 mtg per wk ... |
4072 | EGMN416 | 46922 | LEC | MECHATRONICS | V00768493 | Hadimani,Ravi | M...F..1300.01350.0 | 0.5 | 29.0 | 43.5 | 0.333333 | LEC | BASE: [wrkld] / 3 CRN at same time |
sum_term_crse_crn represents the number of unique CRNs. This value is split across multiple meeting times per CRN.
sum_term_crse_crn_mtg_students represents the number of students in each crn-meeting time. This value is useful to determine how many students are in any given meeting time.
sum_term_crse_crn_hours represents the number of credit hours associated with a CRN. This value is split across multiple meeting times.
Example: LECT sharing a common time
In this example, two CRN are sharing a common teaching time and their workloads are split across the two CRN.
show_block(stacked_df,'TERM':'202320','instructor_name':"Heise,Rebecca"},
{'COURSE','CRN','TYPE','TITLE','instructor_id','instructor_name','time_code','sum_term_crse_wrkld_sample','wrkld_sample_type','wrkld_sample_note']
[ )
COURSE | CRN | TYPE | TITLE | instructor_id | instructor_name | time_code | sum_term_crse_wrkld_sample | wrkld_sample_type | wrkld_sample_note | |
---|---|---|---|---|---|---|---|---|---|---|
3079 | EGRB111 | 43476 | LEC | INTRO TO BIOL SYS ENGINEERING | V00541480 | Heise,Rebecca | M.W.F..900.0950.0 | 1.0 | LEC | BASE: [wrkld] |
3101 | EGRB411 | 40539 | LEC | CELL MECHANICS & MECHANOBIOL | V00541480 | Heise,Rebecca | M.W.F..1000.01050.0 | 0.5 | LEC | BASE: [wrkld] / 2 CRN at same time |
3105 | EGRB491 | 45797 | LEC | ST: PULMONARY LAB RESEARCH | V00541480 | Heise,Rebecca | ......... | 1.0 | LEC | BASE: [wrkld] |
3107 | EGRB517 | 40540 | LEC | CELL MECHANICS & MECHANOBIOLOG | V00541480 | Heise,Rebecca | M.W.F..1000.01050.0 | 0.5 | LEC | BASE: [wrkld] / 2 CRN at same time |
3119 | EGRB697 | 45891 | RES | DIRECTED RES IN BIOMEDICAL EGR | V00541480 | Heise,Rebecca | ......... | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3127 | EGRB697 | 28930 | RES | DIRECTED RES IN BIOMEDICAL EGR | V00541480 | Heise,Rebecca | ......... | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3128 | EGRB697 | 28934 | RES | DIRECTED RES IN BIOMEDICAL EGR | V00541480 | Heise,Rebecca | ......... | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3129 | EGRB697 | 45890 | RES | DIRECTED RES IN BIOMEDICAL EGR | V00541480 | Heise,Rebecca | ......... | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3130 | EGRB697 | 28942 | RES | DIRECTED RES IN BIOMEDICAL EGR | V00541480 | Heise,Rebecca | ......... | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3526 | ENGR701 | 43901 | RES | POST-CANDIDACY DOCT RESEARCH | V00541480 | Heise,Rebecca | ......... | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
Example: Miscode?
This record shows two CRN overlapping in teaching time, but not exactly matching teaching times. Is this for real? Is this a miscode? My initial guess is that this is a miscode and the times should be corrected. This can be accomplished with a FIX record above.
show_block(stacked_df,'TERM':'202310','instructor_name':"Ferri,James"},
{'COURSE','CRN','TYPE','TITLE','instructor_name','time_code','MODALITY CODE','sum_term_crse_wrkld_sample','wrkld_sample_type','wrkld_sample_note']
[ )
COURSE | CRN | TYPE | TITLE | instructor_name | time_code | MODALITY CODE | sum_term_crse_wrkld_sample | wrkld_sample_type | wrkld_sample_note | |
---|---|---|---|---|---|---|---|---|---|---|
2636 | CLSE301 | 42633 | LEC | TRANSPORT PHENOMENA I | Ferri,James | .T.R...1530.01645.0 | RINP | 1.0 | LEC | BASE: [wrkld] |
2648 | CLSE585 | 44879 | LEC | INTERFACIAL PHENOMENA | Ferri,James | ..W....1700.01930.0 | RINP | 1.0 | LEC | BASE: [wrkld] |
2657 | CLSE697 | 37073 | RES | DIRECTED RESEARCH | Ferri,James | ......... | RINS | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
2664 | CLSE697 | 37469 | RES | DIRECTED RESEARCH | Ferri,James | ......... | RINS | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
2941 | ENGR591 | 44935 | LEC | TOP: SYSTEMS ENGINEERING I | Ferri,James | ......... | RONA | 1.0 | LEC | BASE: [wrkld] |
2949 | ENGR691 | 44989 | LEC | ST:ADV TOP IN INTERFACIAL PHEN | Ferri,James | ..W....1700.01900.0 | RINP | 1.0 | LEC | BASE: [wrkld] |
2956 | ENGR701 | 43323 | RES | POST-CANDIDACY DOCT RESEARCH | Ferri,James | ......... | RINS | 0.0 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
Example: Singleton ROND
This shows a singleton ROND sections. Synchronous on-line sections (ROND) should always be taught with an in-person section. In this case I can only imagine that no students enrolled in the RINP/face-to-face section and it was dropped during earlier data cleaning.
show_block(stacked_df,'TERM':'202320','instructor_name':"Manic,Milos"},
{'COURSE','CRN','TYPE','TITLE','instructor_name','time_code','MODALITY CODE','sum_term_crse_wrkld_sample','wrkld_sample_type','wrkld_sample_note']
[ )
COURSE | CRN | TYPE | TITLE | instructor_name | time_code | MODALITY CODE | sum_term_crse_wrkld_sample | wrkld_sample_type | wrkld_sample_note | |
---|---|---|---|---|---|---|---|---|---|---|
3036 | CMSC636 | 44831 | LEC | NEURAL NETS AND DEEP LEARNING | Manic,Milos | M.W....1200.01315.0 | ROND | 1.000000 | LEC | BASE: [wrkld] |
3052 | CMSC697 | 36849 | RES | DIRECTED RESEARCH | Manic,Milos | ......... | RINS | 0.000000 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3053 | CMSC697 | 36851 | RES | DIRECTED RESEARCH | Manic,Milos | ......... | RINS | 0.000000 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3510 | ENGR497 | 45526 | LAB | VERTICALLY INTEGRATED PROJECTS | Manic,Milos | ......... | RINS | 0.333333 | VIP | RULE: [wrkld]=0.33 for VIP |
3511 | ENGR497 | 45527 | LAB | VERTICALLY INTEGRATED PROJECTS | Manic,Milos | ......... | RINS | 0.333333 | VIP | RULE: [wrkld]=0.33 for VIP |
Example: Doubleton ROND+RINP
This second example shows a doubleton ROND+RINP section.
show_block(stacked_df,'TERM':'202410','instructor_name':"Ghosh,Preetam"},
{'COURSE','CRN','TYPE','TITLE','instructor_name','time_code','MODALITY CODE','sum_term_crse_wrkld_sample','wrkld_sample_type','wrkld_sample_note']
[ )
COURSE | CRN | TYPE | TITLE | instructor_name | time_code | MODALITY CODE | sum_term_crse_wrkld_sample | wrkld_sample_type | wrkld_sample_note | |
---|---|---|---|---|---|---|---|---|---|---|
3703 | CMSC441 | 43750 | LAB | SR DESIGN STUDIO I (LAB/PROJ) | Ghosh,Preetam | ......... | RONL | 0.666667 | CAP | RULE: [wrkld]=0.33 for every 4.0 students in C... |
3725 | CMSC502 | 37708 | LEC | PARALLEL ALGORITHMS | Ghosh,Preetam | .T.R...1530.01645.0 | RINP | 0.500000 | LEC | BASE: [wrkld] / 2 CRN at same time |
3726 | CMSC502 | 37709 | LEC | PARALLEL ALGORITHMS | Ghosh,Preetam | .T.R...1530.01645.0 | ROND | 0.500000 | LEC | BASE: [wrkld] / 2 CRN at same time |
3739 | CMSC697 | 37384 | RES | DIRECTED RESEARCH | Ghosh,Preetam | ......... | RINS | 0.000000 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3740 | CMSC697 | 37385 | RES | DIRECTED RESEARCH | Ghosh,Preetam | ......... | RINS | 0.000000 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
3741 | CMSC697 | 37386 | RES | DIRECTED RESEARCH | Ghosh,Preetam | ......... | RINS | 0.000000 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
4192 | ENGR701 | 46939 | RES | POST-CANDIDACY DOCT RESEARCH | Ghosh,Preetam | ......... | RINS | 0.000000 | RES | RULE: [wrkld]=0.00 for INS, RES and FLD |
Example: odd counting
Verifying actual enrollment vs sum_term_crse_wrkld_sample
show_block(stacked_df,'TERM':'202410','instructor_name':"Duke,Debra",'COURSE':'CMSC256'},
{'COURSE','CRN','TYPE','TITLE','instructor_name','time_code','MODALITY CODE','ACTUAL ENROLLMENT','MAX CREDITS','time_code','sum_term_crse_crn_mtg_students','sum_term_crse_crn_hours',
['sum_term_crse_wrkld_sample','wrkld_sample_type','wrkld_sample_note']
)
COURSE | CRN | TYPE | TITLE | instructor_name | time_code | MODALITY CODE | ACTUAL ENROLLMENT | MAX CREDITS | time_code | sum_term_crse_crn_mtg_students | sum_term_crse_crn_hours | sum_term_crse_wrkld_sample | wrkld_sample_type | wrkld_sample_note | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3670 | CMSC256 | 36067 | LEC | DATA STRUCTURE & OBJECT PROG | Duke,Debra | ..W....1300.01450.0 | RONB | 83 | 4.0 | ..W....1300.01450.0 | 83.0 | 166.0 | 0.333333 | LAB | RULE: [wrkld]=0.33 for LAB (LAB: 1 mtg per wk ... |
3671 | CMSC256 | 36067 | LEC | DATA STRUCTURE & OBJECT PROG | Duke,Debra | .T.R...1230.01345.0 | RONB | 83 | 4.0 | .T.R...1230.01345.0 | 83.0 | 166.0 | 1.500000 | LEC | BASE: [wrkld] = 1.5 per CRN ENRL>=80 |
3672 | CMSC256 | 41689 | LEC | DATA STRUCTURE & OBJECT PROG | Duke,Debra | ..W....1600.01750.0 | ROND | 31 | 4.0 | ..W....1600.01750.0 | 31.0 | 62.0 | 0.333333 | LAB | RULE: [wrkld]=0.33 for LAB (LAB: 1 mtg per wk ... |
3673 | CMSC256 | 41689 | LEC | DATA STRUCTURE & OBJECT PROG | Duke,Debra | .T.R...1600.01715.0 | ROND | 31 | 4.0 | .T.R...1600.01715.0 | 31.0 | 62.0 | 1.000000 | LEC | BASE: [wrkld] |
Store the dataframe
We’re storing both the stacked and unstacked data. Note that the aggregate measures are stored with the stacked data only.
Store to a local CSV file
# Store as CSV files
'sections_df.csv', index=False)
sections_df.to_csv('stacked_df.csv', index=False) stacked_df.to_csv(
Store the data in google sheets
# Open the worksheet
= "1ZK7k8M85CXLof6FdeJYJuGFbfjsOXrCv5mc7OgUInWw"
spreadsheet_key = "Source data"
worksheet_name
= stacked_df.to_records(index=False)
data_to_write try:
= client.open_by_key(spreadsheet_key).worksheet(worksheet_name)
sheet except SpreadsheetNotFound as e:
= client.open_by_key(spreadsheet_key).add_worksheet(
sheet = worksheet_name,nrows=100,ncols=10
title
)except APIError as e:
if 'insufficient permissions' in str(e).lower():
print(f"Permission denied error: {e}")
raise PermissionError(f"Permission denied: {e}") # Creating a new PermissionError
else:
print(f"Failed to fetch data due to API error: {e}")
raise
except ValueError as e:
print(f"Data retrieval issue: {e}")
raise
except Exception as e:
# A generic catch-all to handle unexpected errors
print(f"An unexpected error occurred: {e}")
raise
try:
sheet.clear()=sheet, dataframe=stacked_df, include_index=False,include_column_header=True, resize=True)
set_with_dataframe(worksheetexcept:
print(f"Data can't be written")
Freshen persistent instructor data
This block identifies any instructors not found in the persistent instructor data and adds them to the list with default values.
This code is not working yet. It should be fixed when a new semester is added and new instructors and courses are discovered in the imported worksheets.
#worksheet_name = "Instructor data"
#summary_df = stacked_df.groupby('instructor')[['COLLEGE','DEPT']].apply(lambda x: x.mode().iloc[0]).#reset_index()
#summary_df = summary_df.sort_values(by=['COLLEGE','DEPT','instructor'])
#data_to_write = summary_df.to_records(index=False)
#try:
# sheet = client.open_by_key(spreadsheet_key).worksheet(worksheet_name)
#except:
# nrows,ncols = summary_df.shape
# sheet = client.open_by_key(spreadsheet_key).add_worksheet(
# title = worksheet_name,rows=nrows+1,cols=ncols+1
# )
#sheet.clear()
#set_with_dataframe(worksheet=sheet, dataframe=summary_df, include_index=False,include_column_header=True, resize=True)
# Find names in df2 that are not in df1
##names_to_add = df2[~df2['name'].isin(df1['name'])]
# Add the rows with missing names from df2 to df1
## df1 = pd.concat([df1, names_to_add], ignore_index=True)
Summary
That’s all folks! Additional models can be added and the google sheets can be reviewed.
Now the real analysis can begin:
- here is a link to a google sheet that shows tables created from the dataframe.
- here is a HTML report that is still evolving.
- here is the CSV for the cleaned dataframe. This dataframe contains one tuple per (TERM,CRN,INSTRUCTOR,MEETING_CODE).