# -*- coding: utf-8 -*-
# Generates, for every model listed in lis_model, folders containing the
# per-subject text files and the matching R program.  The working directory
# must contain test_iid.txt and the Excel file with the models.

## import the packages
import os
import shutil

import numpy as np
import pandas as pd

# Here are the setup lines, which specify the input data file, the names of
# the worksheets with the datasets, the number of lines of data to be sampled
# for each so-called "subject" (lis_reps), and the number of "subjects" to
# sample (n_subs).
excel_file = 'MARTER_data.xlsx'
lis_model = ['Trans1', 'Trans2', 'Trans3', 'Intrans1', 'Intrans2', 'iid_data']
lis_reps = ['10', '20', '100']
n_subs = 20
lis_start = {
    "Trans1": [5001, 5201, 5601],
    "Trans2": [5001, 5201, 5601],
    "Trans3": [5001, 5201, 5601],
    "Intrans1": [5001, 5201, 5601],
    "Intrans2": [5001, 5201, 5601],
    "iid_data": [5001, 5201, 5601],
}
# lis_start gives, per worksheet, the starting line for each entry of
# lis_reps (in the same order).  With the defaults, sampling begins on line
# 5001 and the three samples (20 x 10, 20 x 20 and 20 x 100 lines) follow one
# another without overlap.
# To start at a different line number, please see the guide.

# Name of the R template that is copied into every output folder.
_TEMPLATE_FILE = 'test_iid.txt'

# Column names of the six digit columns written to the text files.
_COLUMNS = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']


def _split_digits(df):
    """Return a frame with the response codes split into columns v1..v6.

    The first three characters of the string form of ``response1`` become
    v1-v3 and those of ``response2`` become v4-v6, each converted to int.
    """
    frame = pd.DataFrame()
    for offset, column in enumerate(('response1', 'response2')):
        as_text = df[column].astype(str)
        for pos in range(3):
            name = _COLUMNS[3 * offset + pos]
            frame[name] = as_text.str[pos].astype(int)
    return frame


def _subject_blocks(frame, n_reps, n_subs, start):
    """Cut ``n_subs`` consecutive blocks of ``n_reps`` rows out of ``frame``.

    ``start`` is a 0-based row position in ``frame``.
    NOTE(review): the worksheet is read with its first row taken as a header,
    so position p presumably corresponds to Excel row p + 2; the original
    script carried a commented-out ``start = start - 2`` adjustment -- confirm
    against the guide which convention is intended.

    Every returned frame starts with a header row ("" followed by v1..v6) and
    has a leading column numbering the data rows 1..n_reps.  Blocks that reach
    past the end of the data are returned short (or header-only), as before.
    """
    header = pd.DataFrame([_COLUMNS], columns=_COLUMNS)
    blocks = []
    for subject in range(n_subs):
        first = start + subject * n_reps
        rows = frame.iloc[first:first + n_reps]
        block = pd.concat([header, rows], ignore_index=True)
        # Build the row-number column with its blank header cell in one go;
        # writing "" into an integer column afterwards is an incompatible
        # dtype assignment that recent pandas versions deprecate.
        block.insert(0, 'index', [""] + list(range(1, len(block))))
        blocks.append(block)
    return blocks


def _write_subject_files(folder, blocks):
    """Write one tab-separated text file per block; return the file names."""
    names = []
    for number, block in enumerate(blocks, start=1):
        # Zero-pad to two digits: <folder>_01.txt ... <folder>_20.txt
        name = '{}_{:02d}.txt'.format(folder, number)
        np.savetxt(os.path.join(folder, name), block.values,
                   fmt='%s', delimiter="\t")
        names.append(name)
    return names


def _write_r_script(folder, file_names, n_reps, n_subs):
    """Place a copy of the template in ``folder`` as test_iid.R and fill it in.

    The placeholders ``files1<-c()``, ``nsubs<-3`` and ``nreps<-10`` of the
    template are replaced by the actual file list and counts.
    """
    script_path = os.path.join(folder, 'test_iid.R')
    shutil.copy2(_TEMPLATE_FILE, script_path)
    with open(script_path) as handle:
        script = handle.read()

    # R character vector, e.g. ('a_01.txt', 'a_02.txt')
    r_vector = '(' + ', '.join(repr(name) for name in file_names) + ')'
    script = script.replace('files1<-c()', 'files1<-c' + r_vector)
    script = script.replace('nsubs<-3', 'nsubs<-' + str(n_subs))
    script = script.replace('nreps<-10', 'nreps<-' + str(n_reps))

    with open(script_path, 'w') as handle:
        handle.write(script)


def subsample_iid_test_setup(lis_model, lis_reps, n_subs, excel_file, lis_start):
    """Create the sample folders, text files and R scripts for every model.

    For each worksheet in ``lis_model`` and each entry of ``lis_reps`` a
    folder ``<model>_rep_<reps>_subs_<n_subs>`` is created in the working
    directory.  It receives ``n_subs`` text files of ``reps`` data rows each
    plus ``test_iid.R``, a filled-in copy of ``test_iid.txt``.

    Parameters:
        lis_model: names of the worksheets to read from ``excel_file``.
        lis_reps: numbers of rows per subject (strings such as '10').
        n_subs: number of subjects (text files) per folder.
        excel_file: path of the Excel workbook; the first two columns of each
            worksheet are read as ``response1`` and ``response2``.
        lis_start: maps each worksheet name to a list of start rows, one per
            entry of ``lis_reps``.

    Raises:
        OSError: if an output folder already exists (``os.mkdir``) or
            ``test_iid.txt`` cannot be read.
        KeyError / IndexError: if ``lis_start`` lacks an entry for a model or
            for one of the ``lis_reps`` positions.
    """
    # adds the data from the excel sheets to a dictionary of data frames
    sheets = {}
    for model in lis_model:
        sheets[model] = pd.read_excel(excel_file, sheet_name=model,
                                      names=['response1', 'response2'])

    # first pass: write all folders with their text files
    jobs = []
    for model, sheet in sheets.items():
        digits = _split_digits(sheet)  # converted once per worksheet
        for position, reps in enumerate(lis_reps):
            start = lis_start[model][position]
            blocks = _subject_blocks(digits, int(reps), n_subs, start)
            folder = '{}_rep_{}_subs_{}'.format(model, reps, n_subs)
            os.mkdir(folder)
            file_names = _write_subject_files(folder, blocks)
            jobs.append((folder, file_names, reps))

    # second pass: write the R files based on test_iid.txt into the folders
    for folder, file_names, reps in jobs:
        _write_r_script(folder, file_names, reps, n_subs)


subsample_iid_test_setup(lis_model, lis_reps, n_subs, excel_file, lis_start)