{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "First, we will generate a version of the dataset that maps text to number values as defined in the dataset dictionary. Then, several copies of this dataset will be generated based on their usage, and whether they are shareable or not." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import re as re\n", "\n", "def find_number(text):\n", " if type(text) == str:\n", " num = re.findall(r'[0-9]+',text)\n", " return \" \".join(num)\n", " else:\n", " return(text)\n", "\n", "raw = pd.read_csv('data_raw.csv')\n", "raw.columns = ['ID','ID_survey','Time_start','Consent','Complete_1','Time_complete',\n", " 'Age','Gender','Gender_other','Role','Role_other','School',\n", " 'School_other','Department','Affiliation_other','UoR_tenure',\n", " 'Research_active','Discipline','Methods','OR_meaning',\n", " 'OAP_use','OAP_12m','OAP_12mPlus','OAP_NU','OAP_NH','OAP_NA',\n", " 'OD_use','OD_12m','OD_12mPlus','OD_NU','OD_NH','OD_NA',\n", " 'OSS_use','OSS_12m','OSS_12mPlus','OSS_NU','OSS_NH','OSS_NA',\n", " 'OC_use','OC_12m','OC_12mPlus','OC_NU','OC_NH','OC_NA',\n", " 'OH_use','OH_12m','OH_12mPlus','OH_NU','OH_NH','OH_NA',\n", " 'PP_use','PP_12m','PP_12mPlus','PP_NU','PP_NH','PP_NA',\n", " 'OPR_use','OPR_12m','OPR_12mPlus','OPR_NU','OPR_NH','OPR_NA',\n", " 'OL_use','OL_12m','OL_12mPlus','OL_NU','OL_NH','OL_NA',\n", " 'AM_use','AM_12m','AM_12mPlus','AM_NU','AM_NH','AM_NA',\n", " 'OER_use','OER_12m','OER_12mPlus','OER_NU','OER_NH','OER_NA',\n", " 'CWT_use','CWT_12m','CWT_12mPlus','CWT_NU','CWT_NH','CWT_NA',\n", " 'PR_use','PR_12m','PR_12mPlus','PR_NU','PR_NH','PR_NA',\n", " 'RR_use','RR_12m','RR_12mPlus','RR_NU','RR_NH','RR_NA',\n", " 'OPP_use','OPP_12m','OPP_12mPlus','OPP_NU','OPP_NH','OPP_NA',\n", " 'OEN_use','OEN_12m','OEN_12mPlus','OEN_NU','OEN_NH','OEN_NA',\n", " 'Practices_other','Why_OR','Why_not_OR','Relevance_OR',\n", " 'Advice_OR','NPS','NPS_why','Importance_OR','Integrity_OR',\n", " 'Reproducibility_OR','Organise_OR','UoR_support','Colleague_support',\n", " 'Importance_discipline','OPP_Vis','OPP_Cite','OPP_PE','OPP_Impact',\n", " 'OPP_intD','OPP_Qual','OPP_TransInt','OPP_Rep','OPP_InnKT','OPP_Econ',\n", " 'OPP_NH','OPP_Other','OPP_OtherAns','CHAL_Incent','CHAL_Regs','CHAL_Train',\n", " 'CHAL_Infra','CHAL_Time','CHAL_Fund','CHAL_Senior','CHAL_IntJR','CHAL_NA',\n", " 'CHAL_Adopt','CHAL_NH','CHAL_Other','CHAL_OtherAns','FAM_OAPF','FAM_ORS',\n", " 'FAM_ORC','FAM_ReproTea','FAM_RDA','FAM_CentAUR','FAM_Other','Learn',\n", " 'Train_OAP','Train_RDM','Train_DS','Train_OD','Train_OSS','Train_OC',\n", " 'Train_OH','Train_PP','Train_OPR','Train_OL','Train_AM','Train_OER',\n", " 'Train_CWT','Train_PR','Train_RR','Train_OPP','Train_OEN','Train_RI',\n", " 'Train_Stats','Train_None','Train_Other','Train_OtherAns','Train_Deets',\n", " 'FollowUp','MailingList','Prize','Complete_2']\n", "raw['Age_cat'] = pd.cut(raw['Age'], bins=[0,17.5,25.5,30.5,35.5,40.5,45.5,50.5,55.5,60.5,65.5,70.5,75.5,np.inf],labels=[0,1,2,3,4,5,6,7,8,9,10,11,12])\n", "raw = raw.replace(['Unchecked','Incomplete','No','Prefer not to say',\"Don't know\",\"Don't Know\",\n", " 'Other','Less than 1 year'], 0)\n", "raw = raw.replace(['Checked','Complete','Woman','Yes','Strongly Disagree','Not at all familiar',\n", " 'Undergraduate student','Agriculture, Policy and Development'], 1)\n", "raw = raw.replace(['Checked','Complete','Man','Yes','Disagree','Slightly familiar',\n", " 'Master student','Archaeology, Geography and Environmental Science'], 2)\n", "raw = raw.replace(['Non-binary','Agree','PhD student','Fairly familiar','Arts and Communication Design'], 3)\n", "raw = raw.replace(['Post-doctoral researcher','Strongly Agree','Familiar','Biological Sciences'], 4)\n", "raw = raw.replace(['Teaching fellow','Very familiar','Built Environment'], 5)\n", "raw = raw.replace(['Research assistant','Chemistry, Food and Pharmacy'], 6)\n", "raw = raw.replace(['Lecturer','Henley Business School'], 7)\n", "raw = raw.replace(['Associate Professor','Humanities'], 8)\n", "raw = raw.replace(['Professor','Institute of Education'], 9)\n", "raw = raw.replace(['International Study and Language Institute'], 10)\n", "raw = raw.replace(['Law'], 11)\n", "raw = raw.replace(['Literature and Languages'], 12)\n", "raw = raw.replace(['Mathematical, Physical and Computational Sciences'], 13)\n", "raw = raw.replace(['Politics, Economics and International Relations'], 14)\n", "raw = raw.replace(['Psychology and Clinical Language Sciences'], 15)\n", "raw = raw[raw['Consent']==1]\n", "raw['UoR_tenure'] = raw['UoR_tenure'].dropna().apply(lambda x: find_number(x)) \n", "raw = raw[~raw[\"ID\"].isin([44, 116, 149, 272, 355])]\n", "\n", "qual = raw [['ID','Consent','Complete_1',\n", " 'Age','Age_cat','Gender','Gender_other','Role','Role_other','School',\n", " 'UoR_tenure',\n", " 'Research_active','Discipline','Methods','OR_meaning',\n", " 'Practices_other','Why_OR','Why_not_OR','Relevance_OR',\n", " 'Advice_OR','NPS','NPS_why',\n", " 'Importance_discipline','OPP_OtherAns','CHAL_OtherAns','FAM_Other','Learn',\n", " 'Train_OtherAns','Train_Deets',\n", " 'FollowUp','Complete_2']]\n", "\n", "quant = raw [['ID','Consent','Complete_1',\n", " 'Age','Age_cat','Gender','Role','School',\n", " 'UoR_tenure',\n", " 'Research_active',\n", " 'OAP_use','OAP_12m','OAP_12mPlus','OAP_NU','OAP_NH','OAP_NA',\n", " 'OD_use','OD_12m','OD_12mPlus','OD_NU','OD_NH','OD_NA',\n", " 'OSS_use','OSS_12m','OSS_12mPlus','OSS_NU','OSS_NH','OSS_NA',\n", " 'OC_use','OC_12m','OC_12mPlus','OC_NU','OC_NH','OC_NA',\n", " 'OH_use','OH_12m','OH_12mPlus','OH_NU','OH_NH','OH_NA',\n", " 'PP_use','PP_12m','PP_12mPlus','PP_NU','PP_NH','PP_NA',\n", " 'OPR_use','OPR_12m','OPR_12mPlus','OPR_NU','OPR_NH','OPR_NA',\n", " 'OL_use','OL_12m','OL_12mPlus','OL_NU','OL_NH','OL_NA',\n", " 'AM_use','AM_12m','AM_12mPlus','AM_NU','AM_NH','AM_NA',\n", " 'OER_use','OER_12m','OER_12mPlus','OER_NU','OER_NH','OER_NA',\n", " 'CWT_use','CWT_12m','CWT_12mPlus','CWT_NU','CWT_NH','CWT_NA',\n", " 'PR_use','PR_12m','PR_12mPlus','PR_NU','PR_NH','PR_NA',\n", " 'RR_use','RR_12m','RR_12mPlus','RR_NU','RR_NH','RR_NA',\n", " 'OPP_use','OPP_12m','OPP_12mPlus','OPP_NU','OPP_NH','OPP_NA',\n", " 'OEN_use','OEN_12m','OEN_12mPlus','OEN_NU','OEN_NH','OEN_NA',\n", " 'NPS','Importance_OR','Integrity_OR',\n", " 'Reproducibility_OR','Organise_OR','UoR_support','Colleague_support',\n", " 'OPP_Vis','OPP_Cite','OPP_PE','OPP_Impact',\n", " 'OPP_intD','OPP_Qual','OPP_TransInt','OPP_Rep','OPP_InnKT','OPP_Econ',\n", " 'OPP_NH','OPP_Other','CHAL_Incent','CHAL_Regs','CHAL_Train',\n", " 'CHAL_Infra','CHAL_Time','CHAL_Fund','CHAL_Senior','CHAL_IntJR','CHAL_NA',\n", " 'CHAL_Adopt','CHAL_NH','CHAL_Other','FAM_OAPF','FAM_ORS',\n", " 'FAM_ORC','FAM_ReproTea','FAM_RDA','FAM_CentAUR',\n", " 'Train_OAP','Train_RDM','Train_DS','Train_OD','Train_OSS','Train_OC',\n", " 'Train_OH','Train_PP','Train_OPR','Train_OL','Train_AM','Train_OER',\n", " 'Train_CWT','Train_PR','Train_RR','Train_OPP','Train_OEN','Train_RI',\n", " 'Train_Stats','Train_None','Train_Other',\n", " 'FollowUp','Complete_2']]\n", "\n", "share = raw [['ID','Consent','Complete_1',\n", " 'Age_cat','Gender','Role','School',\n", " 'UoR_tenure',\n", " 'Research_active',\n", " 'OAP_use','OAP_12m','OAP_12mPlus','OAP_NU','OAP_NH','OAP_NA',\n", " 'OD_use','OD_12m','OD_12mPlus','OD_NU','OD_NH','OD_NA',\n", " 'OSS_use','OSS_12m','OSS_12mPlus','OSS_NU','OSS_NH','OSS_NA',\n", " 'OC_use','OC_12m','OC_12mPlus','OC_NU','OC_NH','OC_NA',\n", " 'OH_use','OH_12m','OH_12mPlus','OH_NU','OH_NH','OH_NA',\n", " 'PP_use','PP_12m','PP_12mPlus','PP_NU','PP_NH','PP_NA',\n", " 'OPR_use','OPR_12m','OPR_12mPlus','OPR_NU','OPR_NH','OPR_NA',\n", " 'OL_use','OL_12m','OL_12mPlus','OL_NU','OL_NH','OL_NA',\n", " 'AM_use','AM_12m','AM_12mPlus','AM_NU','AM_NH','AM_NA',\n", " 'OER_use','OER_12m','OER_12mPlus','OER_NU','OER_NH','OER_NA',\n", " 'CWT_use','CWT_12m','CWT_12mPlus','CWT_NU','CWT_NH','CWT_NA',\n", " 'PR_use','PR_12m','PR_12mPlus','PR_NU','PR_NH','PR_NA',\n", " 'RR_use','RR_12m','RR_12mPlus','RR_NU','RR_NH','RR_NA',\n", " 'OPP_use','OPP_12m','OPP_12mPlus','OPP_NU','OPP_NH','OPP_NA',\n", " 'OEN_use','OEN_12m','OEN_12mPlus','OEN_NU','OEN_NH','OEN_NA',\n", " 'NPS','Importance_OR','Integrity_OR',\n", " 'Reproducibility_OR','Organise_OR','UoR_support','Colleague_support',\n", " 'OPP_Vis','OPP_Cite','OPP_PE','OPP_Impact',\n", " 'OPP_intD','OPP_Qual','OPP_TransInt','OPP_Rep','OPP_InnKT','OPP_Econ',\n", " 'OPP_NH','OPP_Other','CHAL_Incent','CHAL_Regs','CHAL_Train',\n", " 'CHAL_Infra','CHAL_Time','CHAL_Fund','CHAL_Senior','CHAL_IntJR','CHAL_NA',\n", " 'CHAL_Adopt','CHAL_NH','CHAL_Other','FAM_OAPF','FAM_ORS',\n", " 'FAM_ORC','FAM_ReproTea','FAM_RDA','FAM_CentAUR',\n", " 'Train_OAP','Train_RDM','Train_DS','Train_OD','Train_OSS','Train_OC',\n", " 'Train_OH','Train_PP','Train_OPR','Train_OL','Train_AM','Train_OER',\n", " 'Train_CWT','Train_PR','Train_RR','Train_OPP','Train_OEN','Train_RI',\n", " 'Train_Stats','Train_None','Train_Other',\n", " 'FollowUp','Complete_2']]\n", "\n", "\n", "\n", "raw.to_csv('data_pp.csv', index=False)\n", "qual.to_csv('qual_pp.csv', index=False)\n", "quant.to_csv('quant_pp.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we will add the qualitiative data to the shareable output, and produce a csv file detailing which cells have been changed during anonymisation" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "raw = pd.read_excel('Qualitative_processing/qual_pp_checked.xlsx', sheet_name = 'Raw', index_col = 'ID')\n", "anon = pd.read_excel('Qualitative_processing/qual_pp_checked.xlsx', sheet_name = 'anonymised', index_col = 'ID')\n", "diff = raw.compare(anon).to_csv('Qualitative_processing/changes.csv') #note, self is used for 'raw' and other is used for 'anon'\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "share = share.set_index('ID')\n", "anon = anon.drop(labels = ['Consent','Complete_1','Age','Age_cat','Gender','Gender_other','Role','Role_other','School','UoR_tenure','Research_active','NPS','FollowUp','Complete_2'], axis = 1)\n", "share.join(anon).to_csv('data_share.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 }