# import pandas and numpy, and load the nls and land temperatures data
import pandas as pd
import numpy as np
# configure how pandas renders output: wider console, more visible columns
# and rows, and floats shown with thousands separators and two decimals
pd.options.display.width = 200
pd.options.display.max_columns = 35
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.2f}'.format)

# load the NLS data indexed by personid, and the land temperatures data
nls97 = pd.read_csv("dane/nls97b.csv").set_index("personid")
landtemps = pd.read_csv("dane/landtemps2019avgs.csv")

# use the numpy where function to create a categorical series with 2 values

# look at the elevation quantiles from 0.2 to 1.0 in steps of 0.2
landtemps['elevation'].quantile(np.arange(0.2, 1.1, 0.2))

# rows strictly above the 0.8 quantile are labelled 'Wysoko'; every row for
# which that comparison is False gets 'Nisko'
landtemps['elevation_group'] = np.where(
    landtemps['elevation'].gt(landtemps['elevation'].quantile(0.8)),
    'Wysoko',
    'Nisko',
)
landtemps['elevation_group'] = landtemps['elevation_group'].astype('category')

# check the size and elevation range of each group
landtemps.groupby(['elevation_group'])['elevation'].agg(['count', 'min', 'max'])

# use the numpy where function to create a categorical series with 3 values
landtemps['elevation'].median()

# nested where: above the 0.8 quantile -> 'Wysoko'; otherwise above the
# median -> 'Średnio'; everything else -> 'Nisko'
landtemps['elevation_group'] = np.where(
    landtemps['elevation'].gt(landtemps['elevation'].quantile(0.8)),
    'Wysoko',
    np.where(
        landtemps['elevation'].gt(landtemps['elevation'].median()),
        'Średnio',
        'Nisko',
    ),
)
landtemps['elevation_group'] = landtemps['elevation_group'].astype('category')

# check the size and elevation range of each group
landtemps.groupby(['elevation_group'])['elevation'].agg(['count', 'min', 'max'])

# use numpy select to evaluate a list of conditions; the conditions are
# checked in order and the first one that holds decides the label
test = [
    nls97['gpaoverall'].lt(2) & nls97['highestdegree'].eq('0. None'),
    nls97['highestdegree'].eq('0. None'),
    nls97['gpaoverall'].lt(2),
]
result = [
    '1. Niska średnia i brak dyplomu',
    '2. Brak dyplomu',
    '3. Niska średnia',
]

# rows matching none of the conditions receive the default label
# NOTE(review): a row with a degree but a missing gpaoverall fails every test
# and so also lands in the default group -- confirm this is intended
nls97['hsachieve'] = np.select(test, result, default='4. Dyplom i wysoka średnia')

# inspect a few rows and the distribution of the new column
nls97[['hsachieve', 'gpaoverall', 'highestdegree']].head()
nls97['hsachieve'].value_counts().sort_index()

# create a flag if individual ever had bachelor degree enrollment

# show the enrollment columns for three sample respondents
nls97.loc[[100292, 100583, 100139], 'colenrfeb00':'colenroct04'].T

# for every column whose name contains "colenr", test whether the value's
# first character is '3' (presumably the bachelor-level enrollment code --
# confirm against the codebook), then flag rows where any column matches
nls97['baenrollment'] = (
    nls97.filter(like="colenr")
    .apply(lambda col: col.str[0:1].eq('3'))
    .any(axis=1)
)

# check the flag for the same respondents and its overall distribution
nls97.loc[[100292, 100583, 100139], ['baenrollment']].T
nls97['baenrollment'].value_counts()

# use apply and lambda to create a more complicated categorical series
def getsleepdeprivedreason(row):
    """Return a label describing why a respondent may be short on sleep.

    Args:
        row: Object exposing ``nightlyhrssleep``, ``weeksworked16``,
            ``weeksworked17``, ``childathome``, ``wageincome`` and
            ``highestgradecompleted`` as attributes (for example a
            DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        One of the labels "Śpi wystarczająco długo", "Dzieci",
        "Inne powody", "Wpływ pracy", "Kwestia zarobków" or "Nieznany".
    """
    hours = row.nightlyhrssleep

    # Six or more hours a night counts as enough sleep.
    if hours >= 6:
        return "Śpi wystarczająco długo"

    # Anything that is not a positive number of hours -- including a
    # missing value (NaN), which fails both comparisons -- is unclassifiable.
    if not hours > 0:
        return "Nieznany"

    # Fewer than 80 weeks worked across the two years: the reason depends
    # on whether more than two children live at home.
    if row.weeksworked16 + row.weeksworked17 < 80:
        return "Dzieci" if row.childathome > 2 else "Inne powody"

    # Otherwise the comparison above was False (80 or more weeks, or a
    # missing week count): split on wage income or completed grade.
    high_income_or_grade = (
        row.wageincome >= 62000 or row.highestgradecompleted >= 16
    )
    return "Wpływ pracy" if high_income_or_grade else "Kwestia zarobków"

# classify every respondent row by row and keep the labels as a categorical column
nls97['sleepdeprivedreason'] = (
    nls97.apply(getsleepdeprivedreason, axis=1).astype('category')
)

# distribution of the resulting labels
nls97['sleepdeprivedreason'].value_counts()


def getsleepdeprivedreason(childathome, nightlyhrssleep, wageincome, weeksworked16, weeksworked17, highestgradecompleted):
    """Return a label describing why a respondent may be short on sleep.

    Variant of the classifier that takes the individual values as
    arguments instead of a whole row.

    Args:
        childathome: Compared against 2 when fewer than 80 weeks were worked.
        nightlyhrssleep: Hours of sleep per night; 6 or more is enough.
        wageincome: Compared against 62000 in the work branch.
        weeksworked16: Weeks worked, summed with ``weeksworked17``.
        weeksworked17: Weeks worked, summed with ``weeksworked16``.
        highestgradecompleted: Compared against 16 in the work branch.

    Returns:
        One of the labels "Śpi wystarczająco długo", "Dzieci",
        "Inne powody", "Wpływ pracy", "Kwestia zarobków" or "Nieznany".
    """
    # Six or more hours a night counts as enough sleep.
    if nightlyhrssleep >= 6:
        return "Śpi wystarczająco długo"

    # Zero, negative or missing (NaN) hours fail this comparison and
    # cannot be classified.
    if not nightlyhrssleep > 0:
        return "Nieznany"

    # Fewer than 80 weeks worked in total: decide on children at home.
    if weeksworked16 + weeksworked17 < 80:
        if childathome > 2:
            return "Dzieci"
        return "Inne powody"

    # Reached when the weeks test is False (80 or more, or a missing count).
    if wageincome >= 62000 or highestgradecompleted >= 16:
        return "Wpływ pracy"
    return "Kwestia zarobków"

# same classification, but pass the needed values from each row explicitly
# to the scalar-argument version of the function
nls97['sleepdeprivedreason'] = nls97.apply(
    lambda row: getsleepdeprivedreason(
        childathome=row.childathome,
        nightlyhrssleep=row.nightlyhrssleep,
        wageincome=row.wageincome,
        weeksworked16=row.weeksworked16,
        weeksworked17=row.weeksworked17,
        highestgradecompleted=row.highestgradecompleted,
    ),
    axis=1,
).astype('category')

# distribution of the resulting labels
nls97['sleepdeprivedreason'].value_counts()