{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, we will generate a version of the dataset that maps text to number values as defined in the dataset dictionary. Then, several copies of this dataset will be generated based on their usage, and whether they are shareable or not."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re as re\n",
    "\n",
    "def find_number(text):\n",
    "    if type(text) == str:\n",
    "        num = re.findall(r'[0-9]+',text)\n",
    "        return \" \".join(num)\n",
    "    else:\n",
    "        return(text)\n",
    "\n",
    "raw = pd.read_csv('data_raw.csv')\n",
    "raw.columns = ['ID','ID_survey','Time_start','Consent','Complete_1','Time_complete',\n",
    "               'Age','Gender','Gender_other','Role','Role_other','School',\n",
    "               'School_other','Department','Affiliation_other','UoR_tenure',\n",
    "               'Research_active','Discipline','Methods','OR_meaning',\n",
    "               'OAP_use','OAP_12m','OAP_12mPlus','OAP_NU','OAP_NH','OAP_NA',\n",
    "               'OD_use','OD_12m','OD_12mPlus','OD_NU','OD_NH','OD_NA',\n",
    "               'OSS_use','OSS_12m','OSS_12mPlus','OSS_NU','OSS_NH','OSS_NA',\n",
    "               'OC_use','OC_12m','OC_12mPlus','OC_NU','OC_NH','OC_NA',\n",
    "               'OH_use','OH_12m','OH_12mPlus','OH_NU','OH_NH','OH_NA',\n",
    "               'PP_use','PP_12m','PP_12mPlus','PP_NU','PP_NH','PP_NA',\n",
    "               'OPR_use','OPR_12m','OPR_12mPlus','OPR_NU','OPR_NH','OPR_NA',\n",
    "               'OL_use','OL_12m','OL_12mPlus','OL_NU','OL_NH','OL_NA',\n",
    "               'AM_use','AM_12m','AM_12mPlus','AM_NU','AM_NH','AM_NA',\n",
    "               'OER_use','OER_12m','OER_12mPlus','OER_NU','OER_NH','OER_NA',\n",
    "               'CWT_use','CWT_12m','CWT_12mPlus','CWT_NU','CWT_NH','CWT_NA',\n",
    "               'PR_use','PR_12m','PR_12mPlus','PR_NU','PR_NH','PR_NA',\n",
    "               'RR_use','RR_12m','RR_12mPlus','RR_NU','RR_NH','RR_NA',\n",
    "               'OPP_use','OPP_12m','OPP_12mPlus','OPP_NU','OPP_NH','OPP_NA',\n",
    "               'OEN_use','OEN_12m','OEN_12mPlus','OEN_NU','OEN_NH','OEN_NA',\n",
    "               'Practices_other','Why_OR','Why_not_OR','Relevance_OR',\n",
    "               'Advice_OR','NPS','NPS_why','Importance_OR','Integrity_OR',\n",
    "               'Reproducibility_OR','Organise_OR','UoR_support','Colleague_support',\n",
    "               'Importance_discipline','OPP_Vis','OPP_Cite','OPP_PE','OPP_Impact',\n",
    "               'OPP_intD','OPP_Qual','OPP_TransInt','OPP_Rep','OPP_InnKT','OPP_Econ',\n",
    "               'OPP_NH','OPP_Other','OPP_OtherAns','CHAL_Incent','CHAL_Regs','CHAL_Train',\n",
    "               'CHAL_Infra','CHAL_Time','CHAL_Fund','CHAL_Senior','CHAL_IntJR','CHAL_NA',\n",
    "               'CHAL_Adopt','CHAL_NH','CHAL_Other','CHAL_OtherAns','FAM_OAPF','FAM_ORS',\n",
    "               'FAM_ORC','FAM_ReproTea','FAM_RDA','FAM_CentAUR','FAM_Other','Learn',\n",
    "               'Train_OAP','Train_RDM','Train_DS','Train_OD','Train_OSS','Train_OC',\n",
    "               'Train_OH','Train_PP','Train_OPR','Train_OL','Train_AM','Train_OER',\n",
    "               'Train_CWT','Train_PR','Train_RR','Train_OPP','Train_OEN','Train_RI',\n",
    "               'Train_Stats','Train_None','Train_Other','Train_OtherAns','Train_Deets',\n",
    "               'FollowUp','MailingList','Prize','Complete_2']\n",
    "raw['Age_cat'] = pd.cut(raw['Age'], bins=[0,17.5,25.5,30.5,35.5,40.5,45.5,50.5,55.5,60.5,65.5,70.5,75.5,np.inf],labels=[0,1,2,3,4,5,6,7,8,9,10,11,12])\n",
    "raw = raw.replace(['Unchecked','Incomplete','No','Prefer not to say',\"Don't know\",\"Don't Know\",\n",
    "                   'Other','Less than 1 year'], 0)\n",
    "raw = raw.replace(['Checked','Complete','Woman','Yes','Strongly Disagree','Not at all familiar',\n",
    "                   'Undergraduate student','Agriculture, Policy and Development'], 1)\n",
    "raw = raw.replace(['Checked','Complete','Man','Yes','Disagree','Slightly familiar',\n",
    "                   'Master student','Archaeology, Geography and Environmental Science'], 2)\n",
    "raw = raw.replace(['Non-binary','Agree','PhD student','Fairly familiar','Arts and Communication Design'], 3)\n",
    "raw = raw.replace(['Post-doctoral researcher','Strongly Agree','Familiar','Biological Sciences'], 4)\n",
    "raw = raw.replace(['Teaching fellow','Very familiar','Built Environment'], 5)\n",
    "raw = raw.replace(['Research assistant','Chemistry, Food and Pharmacy'], 6)\n",
    "raw = raw.replace(['Lecturer','Henley Business School'], 7)\n",
    "raw = raw.replace(['Associate Professor','Humanities'], 8)\n",
    "raw = raw.replace(['Professor','Institute of Education'], 9)\n",
    "raw = raw.replace(['International Study and Language Institute'], 10)\n",
    "raw = raw.replace(['Law'], 11)\n",
    "raw = raw.replace(['Literature and Languages'], 12)\n",
    "raw = raw.replace(['Mathematical, Physical and Computational Sciences'], 13)\n",
    "raw = raw.replace(['Politics, Economics and International Relations'], 14)\n",
    "raw = raw.replace(['Psychology and Clinical Language Sciences'], 15)\n",
    "raw = raw[raw['Consent']==1]\n",
    "raw['UoR_tenure'] = raw['UoR_tenure'].dropna().apply(lambda x: find_number(x)) \n",
    "raw = raw[~raw[\"ID\"].isin([44, 116, 149, 272, 355])]\n",
    "\n",
    "qual = raw [['ID','Consent','Complete_1',\n",
    "               'Age','Age_cat','Gender','Gender_other','Role','Role_other','School',\n",
    "               'UoR_tenure',\n",
    "               'Research_active','Discipline','Methods','OR_meaning',\n",
    "               'Practices_other','Why_OR','Why_not_OR','Relevance_OR',\n",
    "               'Advice_OR','NPS','NPS_why',\n",
    "               'Importance_discipline','OPP_OtherAns','CHAL_OtherAns','FAM_Other','Learn',\n",
    "               'Train_OtherAns','Train_Deets',\n",
    "               'FollowUp','Complete_2']]\n",
    "\n",
    "quant = raw [['ID','Consent','Complete_1',\n",
    "               'Age','Age_cat','Gender','Role','School',\n",
    "               'UoR_tenure',\n",
    "               'Research_active',\n",
    "               'OAP_use','OAP_12m','OAP_12mPlus','OAP_NU','OAP_NH','OAP_NA',\n",
    "               'OD_use','OD_12m','OD_12mPlus','OD_NU','OD_NH','OD_NA',\n",
    "               'OSS_use','OSS_12m','OSS_12mPlus','OSS_NU','OSS_NH','OSS_NA',\n",
    "               'OC_use','OC_12m','OC_12mPlus','OC_NU','OC_NH','OC_NA',\n",
    "               'OH_use','OH_12m','OH_12mPlus','OH_NU','OH_NH','OH_NA',\n",
    "               'PP_use','PP_12m','PP_12mPlus','PP_NU','PP_NH','PP_NA',\n",
    "               'OPR_use','OPR_12m','OPR_12mPlus','OPR_NU','OPR_NH','OPR_NA',\n",
    "               'OL_use','OL_12m','OL_12mPlus','OL_NU','OL_NH','OL_NA',\n",
    "               'AM_use','AM_12m','AM_12mPlus','AM_NU','AM_NH','AM_NA',\n",
    "               'OER_use','OER_12m','OER_12mPlus','OER_NU','OER_NH','OER_NA',\n",
    "               'CWT_use','CWT_12m','CWT_12mPlus','CWT_NU','CWT_NH','CWT_NA',\n",
    "               'PR_use','PR_12m','PR_12mPlus','PR_NU','PR_NH','PR_NA',\n",
    "               'RR_use','RR_12m','RR_12mPlus','RR_NU','RR_NH','RR_NA',\n",
    "               'OPP_use','OPP_12m','OPP_12mPlus','OPP_NU','OPP_NH','OPP_NA',\n",
    "               'OEN_use','OEN_12m','OEN_12mPlus','OEN_NU','OEN_NH','OEN_NA',\n",
    "               'NPS','Importance_OR','Integrity_OR',\n",
    "               'Reproducibility_OR','Organise_OR','UoR_support','Colleague_support',\n",
    "               'OPP_Vis','OPP_Cite','OPP_PE','OPP_Impact',\n",
    "               'OPP_intD','OPP_Qual','OPP_TransInt','OPP_Rep','OPP_InnKT','OPP_Econ',\n",
    "               'OPP_NH','OPP_Other','CHAL_Incent','CHAL_Regs','CHAL_Train',\n",
    "               'CHAL_Infra','CHAL_Time','CHAL_Fund','CHAL_Senior','CHAL_IntJR','CHAL_NA',\n",
    "               'CHAL_Adopt','CHAL_NH','CHAL_Other','FAM_OAPF','FAM_ORS',\n",
    "               'FAM_ORC','FAM_ReproTea','FAM_RDA','FAM_CentAUR',\n",
    "               'Train_OAP','Train_RDM','Train_DS','Train_OD','Train_OSS','Train_OC',\n",
    "               'Train_OH','Train_PP','Train_OPR','Train_OL','Train_AM','Train_OER',\n",
    "               'Train_CWT','Train_PR','Train_RR','Train_OPP','Train_OEN','Train_RI',\n",
    "               'Train_Stats','Train_None','Train_Other',\n",
    "               'FollowUp','Complete_2']]\n",
    "\n",
    "share = raw [['ID','Consent','Complete_1',\n",
    "               'Age_cat','Gender','Role','School',\n",
    "               'UoR_tenure',\n",
    "               'Research_active',\n",
    "               'OAP_use','OAP_12m','OAP_12mPlus','OAP_NU','OAP_NH','OAP_NA',\n",
    "               'OD_use','OD_12m','OD_12mPlus','OD_NU','OD_NH','OD_NA',\n",
    "               'OSS_use','OSS_12m','OSS_12mPlus','OSS_NU','OSS_NH','OSS_NA',\n",
    "               'OC_use','OC_12m','OC_12mPlus','OC_NU','OC_NH','OC_NA',\n",
    "               'OH_use','OH_12m','OH_12mPlus','OH_NU','OH_NH','OH_NA',\n",
    "               'PP_use','PP_12m','PP_12mPlus','PP_NU','PP_NH','PP_NA',\n",
    "               'OPR_use','OPR_12m','OPR_12mPlus','OPR_NU','OPR_NH','OPR_NA',\n",
    "               'OL_use','OL_12m','OL_12mPlus','OL_NU','OL_NH','OL_NA',\n",
    "               'AM_use','AM_12m','AM_12mPlus','AM_NU','AM_NH','AM_NA',\n",
    "               'OER_use','OER_12m','OER_12mPlus','OER_NU','OER_NH','OER_NA',\n",
    "               'CWT_use','CWT_12m','CWT_12mPlus','CWT_NU','CWT_NH','CWT_NA',\n",
    "               'PR_use','PR_12m','PR_12mPlus','PR_NU','PR_NH','PR_NA',\n",
    "               'RR_use','RR_12m','RR_12mPlus','RR_NU','RR_NH','RR_NA',\n",
    "               'OPP_use','OPP_12m','OPP_12mPlus','OPP_NU','OPP_NH','OPP_NA',\n",
    "               'OEN_use','OEN_12m','OEN_12mPlus','OEN_NU','OEN_NH','OEN_NA',\n",
    "               'NPS','Importance_OR','Integrity_OR',\n",
    "               'Reproducibility_OR','Organise_OR','UoR_support','Colleague_support',\n",
    "               'OPP_Vis','OPP_Cite','OPP_PE','OPP_Impact',\n",
    "               'OPP_intD','OPP_Qual','OPP_TransInt','OPP_Rep','OPP_InnKT','OPP_Econ',\n",
    "               'OPP_NH','OPP_Other','CHAL_Incent','CHAL_Regs','CHAL_Train',\n",
    "               'CHAL_Infra','CHAL_Time','CHAL_Fund','CHAL_Senior','CHAL_IntJR','CHAL_NA',\n",
    "               'CHAL_Adopt','CHAL_NH','CHAL_Other','FAM_OAPF','FAM_ORS',\n",
    "               'FAM_ORC','FAM_ReproTea','FAM_RDA','FAM_CentAUR',\n",
    "               'Train_OAP','Train_RDM','Train_DS','Train_OD','Train_OSS','Train_OC',\n",
    "               'Train_OH','Train_PP','Train_OPR','Train_OL','Train_AM','Train_OER',\n",
    "               'Train_CWT','Train_PR','Train_RR','Train_OPP','Train_OEN','Train_RI',\n",
    "               'Train_Stats','Train_None','Train_Other',\n",
    "               'FollowUp','Complete_2']]\n",
    "\n",
    "\n",
    "\n",
    "raw.to_csv('data_pp.csv', index=False)\n",
    "qual.to_csv('qual_pp.csv', index=False)\n",
    "quant.to_csv('quant_pp.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we will add the qualitiative data to the shareable output, and produce a csv file detailing which cells have been changed during anonymisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = pd.read_excel('Qualitative_processing/qual_pp_checked.xlsx', sheet_name = 'Raw', index_col = 'ID')\n",
    "anon = pd.read_excel('Qualitative_processing/qual_pp_checked.xlsx', sheet_name = 'anonymised', index_col = 'ID')\n",
    "diff = raw.compare(anon).to_csv('Qualitative_processing/changes.csv') #note, self is used for 'raw' and other is used for 'anon'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "share = share.set_index('ID')\n",
    "anon = anon.drop(labels = ['Consent','Complete_1','Age','Age_cat','Gender','Gender_other','Role','Role_other','School','UoR_tenure','Research_active','NPS','FollowUp','Complete_2'], axis = 1)\n",
    "share.join(anon).to_csv('data_share.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}