Riiid Questions Correct


Riiid Questions Correct Prediction

Notes on a good notebook from a Kaggle competition.

Source: 'Riiid: Comprehensive EDA + Baseline' https://www.kaggle.com/erikbruin/riiid-comprehensive-eda-baseline

1. EDA

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('fivethirtyeight')
import seaborn as sns
import os
from matplotlib.ticker import FuncFormatter

import os
for dirname, _, filenames in os.walk('/kaggle/input/riiid-test-answer-prediction'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

As the train dataset is huge, I am gladly using the pickle that Rohan Rao prepared in this kernel: https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets/ (Thanks Rohan!). I actually do this at work all the time, and in this case it reduces the time to load the dataset (with the data types specified in the file description) from close to 9 minutes to about 16 seconds.
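For reference, a minimal sketch of how such a pickle can be produced from the raw CSV (my own reconstruction using the dtypes from the competition's file description, not Rohan's exact code):

import pandas as pd

# dtypes per the file description; prior_question_had_explanation is left as
# object on purpose here, matching train.info() further down
dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
}

train = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/train.csv", dtype=dtypes)
train["content_type_id"] = train["content_type_id"].astype("bool")
train.to_pickle("riiid_train.pkl.gzip", compression="gzip")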

As we can see, we have over 101 million rows in the train set.

%%time

train = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")

print("Train size:", train.shape)

train.memory_usage(deep=True)
Index                                    128
row_id                             809842656
timestamp                          809842656
user_id                            404921328
content_id                         202460664
content_type_id                    101230332
task_container_id                  202460664
user_answer                        101230332
answered_correctly                 101230332
prior_question_elapsed_time        404921328
prior_question_had_explanation    3594972816
dtype: int64
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101230332 entries, 0 to 101230331
Data columns (total 10 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   row_id                          int64  
 1   timestamp                       int64  
 2   user_id                         int32  
 3   content_id                      int16  
 4   content_type_id                 bool   
 5   task_container_id               int16  
 6   user_answer                     int8   
 7   answered_correctly              int8   
 8   prior_question_elapsed_time     float32
 9   prior_question_had_explanation  object 
dtypes: bool(1), float32(1), int16(2), int32(1), int64(2), int8(2), object(1)
memory usage: 3.7+ GB
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('boolean')

train.memory_usage(deep=True)
Index                                   128
row_id                            809842656
timestamp                         809842656
user_id                           404921328
content_id                        202460664
content_type_id                   101230332
task_container_id                 202460664
user_answer                       101230332
answered_correctly                101230332
prior_question_elapsed_time       404921328
prior_question_had_explanation    202460664
dtype: int64
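That single cast shrinks the column from roughly 3.6 GB to about 200 MB: the object dtype stored a pointer to a full Python object per row, while the nullable boolean dtype stores just two bytes per row (one for the value, one for the NA mask).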
%%time

questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
example_test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')
example_sample_submission = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv')

CPU times: user 15.3 ms, sys: 4.43 ms, total: 19.7 ms
Wall time: 22.7 ms

1.1 Exploring the Train Set

print(f'We have {train.user_id.nunique()} unique users in our train set')

We have 393656 unique users in our train set
train.content_type_id.value_counts()
False    99271300
True      1959032
Name: content_type_id, dtype: int64
print(f'We have {train.content_id.nunique()} content ids in our train set, of which {train[train.content_type_id == False].content_id.nunique()} are questions.')
We have 13782 content ids in our train set, of which 13523 are questions.
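The remaining 259 content ids are lectures.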
cids = train.content_id.value_counts()[:30]

fig = plt.figure(figsize=(12,6))
ax = cids.plot.bar()
plt.title("Thirty most used content id's")
plt.xticks(rotation=90)
ax.get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ','))) #add thousands separator
plt.show()

[Figure: bar chart of the thirty most used content ids]

print(f'We have {train.task_container_id.nunique()} unique batches of questions or lectures.')
We have 10000 unique batches of questions or lectures.
train.user_answer.value_counts()
 0    28186489
 1    26990007
 3    26084784
 2    18010020
-1     1959032
Name: user_answer, dtype: int64
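The 1,959,032 rows with user_answer = -1 are the lecture rows, matching the content_type_id counts above.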
# 1 year = 31536000000 ms; dividing by (31536000000 / 12) converts ms to months
ts = train['timestamp'] / (31536000000 / 12)
fig = plt.figure(figsize=(12,6))
ts.plot.hist(bins=100)
plt.title("Histogram of timestamp")
plt.xticks(rotation=0)
plt.xlabel("Months between this user interaction and the first event completion from that user")
plt.show()

[Figure: histogram of timestamp, in months since each user's first event]

print(f'Of the {train.user_id.nunique()} users in train we have {train[train.timestamp == 0].user_id.nunique()} users with a timestamp zero row.')

Of the 393656 users in train we have 393656 users with a timestamp zero row.

The target: answered_correctly

correct = train[train.answered_correctly != -1].answered_correctly.value_counts(ascending=True)

fig = plt.figure(figsize=(12,4))
correct.plot.barh()
for i, v in zip(correct.index, correct.values):
    plt.text(v, i, '{:,}'.format(v), color='white', fontweight='bold', fontsize=14, ha='right', va='center')
plt.title("Questions answered correctly")
plt.xticks(rotation=0)
plt.show()

[Figure: horizontal bar chart of questions answered correctly vs. incorrectly]

bin_labels_5 = ['Bin_1', 'Bin_2', 'Bin_3', 'Bin_4', 'Bin_5']
train['ts_bin'] = pd.qcut(train['timestamp'], q=5, labels=bin_labels_5)
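pd.qcut cuts on quantiles, so each bin receives roughly the same number of rows (about 20% of the interactions each). A tiny illustration with made-up values:

import pandas as pd

s = pd.Series(range(1, 11))
bins = pd.qcut(s, q=5, labels=['Bin_1', 'Bin_2', 'Bin_3', 'Bin_4', 'Bin_5'])
print(bins.value_counts())  # two of the ten values land in each bin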

#make function that can also be used for other fields
def correct(field):
    correct = train[train.answered_correctly != -1].groupby([field, 'answered_correctly'], as_index=False).size()
    correct = correct.pivot(index= field, columns='answered_correctly', values='size')
    correct['Percent_correct'] = round(correct.iloc[:,1]/(correct.iloc[:,0] + correct.iloc[:,1]),2)
    correct = correct.sort_values(by = "Percent_correct", ascending = False)
    correct = correct.iloc[:,2]
    return(correct)

bins_correct = correct("ts_bin")
bins_correct = bins_correct.sort_index()

fig = plt.figure(figsize=(12,6))
plt.bar(bins_correct.index, bins_correct.values)
for i, v in zip(bins_correct.index, bins_correct.values):
    plt.text(i, v, v, color='white', fontweight='bold', fontsize=14, va='top', ha='center')
plt.title("Percent answered_correctly for 5 bins of timestamp")
plt.xticks(rotation=0)
plt.show()

task_id_correct = correct("task_container_id")

fig = plt.figure(figsize=(12,6))
task_id_correct.plot.hist(bins=40)
plt.title("Histogram of percent_correct grouped by task_container_id")
plt.xticks(rotation=0)
plt.show()

user_percent = train[train.answered_correctly != -1].groupby('user_id')['answered_correctly'].agg(Mean='mean', Answers='count')
print(f'The highest number of questions answered by a user is {user_percent.Answers.max()}.')
The highest number of questions answered by a user is 17609.
user_percent = user_percent.query('Answers <= 1000').sample(n=200, random_state=1)

fig = plt.figure(figsize=(12,6))
x = user_percent.Answers
y = user_percent.Mean
plt.scatter(x, y, marker='o')
plt.title("Percent answered correctly versus number of questions answered User")
plt.xticks(rotation=0)
plt.xlabel("Number of questions answered")
plt.ylabel("Percent answered correctly")
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")

plt.show()

content_percent = train[train.answered_correctly != -1].groupby('content_id')['answered_correctly'].agg(Mean='mean', Answers='count')
print(f'The highest number of questions asked by content_id is {content_percent.Answers.max()}.')
print(f'Of {len(content_percent)} content_ids, {len(content_percent[content_percent.Answers > 25000])} content_ids had more than 25,000 questions asked.')
The highest number of questions asked by content_id is 213605.
Of 13523 content_ids, 529 content_ids had more than 25,000 questions asked.
content_percent = content_percent.query('Answers <= 25000').sample(n=200, random_state=1)

fig = plt.figure(figsize=(12,6))
x = content_percent.Answers
y = content_percent.Mean
plt.scatter(x, y, marker='o')
plt.title("Percent answered correctly versus number of questions answered Content_id")
plt.xticks(rotation=0)
plt.xlabel("Number of questions answered")
plt.ylabel("Percent answered correctly")
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")

plt.show()

[Figure: scatter of percent correct vs. number of answers per content_id, with trend line]

pq = train[train.answered_correctly != -1].groupby(['prior_question_had_explanation'], dropna=False).agg({'answered_correctly': ['mean', 'count']})
#pq.index = pq.index.astype(str)
print(pq.iloc[:,1])
pq = pq.iloc[:,0]

fig = plt.figure(figsize=(12,4))
pq.plot.barh()
# for i, v in zip(pq.index, pq.values):
#     plt.text(v, i, round(v,2), color='white', fontweight='bold', fontsize=14, ha='right', va='center')
plt.title("Answered_correctly versus Prior Question had explanation")
plt.xlabel("Percent answered correctly")
plt.ylabel("Prior question had explanation")
plt.xticks(rotation=0)
plt.show()
prior_question_had_explanation
False     9193234
True     89685560
NaN        392506
Name: (answered_correctly, count), dtype: int64

pq = train[train.answered_correctly != -1]
pq = pq[['prior_question_elapsed_time', 'answered_correctly']]
pq = pq.groupby(['answered_correctly']).agg({'answered_correctly': ['count'], 'prior_question_elapsed_time': ['mean']})

pq
                   answered_correctly prior_question_elapsed_time
                                count                        mean
answered_correctly
0                            34026673                25641.992188
1                            65244627                25309.976562
#please be aware that there is an issue with train.prior_question_elapsed_time.mean()
#see https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/195032
mean_pq = train.prior_question_elapsed_time.astype("float64").mean()
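The linked issue comes down to float32 precision: pandas computes the mean of a float32 column in float32, and accumulating ~99 million values in single precision can drift noticeably, which is why the column is cast to float64 first. A quick illustration on synthetic data (not from the kernel):

import numpy as np
import pandas as pd

s = pd.Series(np.full(50_000_000, 25_000.0, dtype=np.float32))
print(s.mean())                    # accumulated in float32; can drift from 25000.0
print(s.astype("float64").mean())  # 25000.0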

condition = ((train.answered_correctly != -1) & (train.prior_question_elapsed_time.notna()))
pq = train[condition][['prior_question_elapsed_time', 'answered_correctly']].sample(n=200, random_state=1)
pq = pq.set_index('prior_question_elapsed_time').iloc[:,0]

fig = plt.figure(figsize=(12,6))
x = pq.index
y = pq.values
plt.scatter(x, y, marker='o')
plt.title("Answered_correctly versus prior_question_elapsed_time")
plt.xticks(rotation=0)
plt.xlabel("Prior_question_elapsed_time")
plt.ylabel("Answered_correctly")
plt.vlines(mean_pq, ymin=-0.1, ymax=1.1)
plt.text(x= 27000, y=0.4, s='mean')
plt.text(x=80000, y=0.6, s='trend')
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")
plt.show()

[Figure: scatter of answered_correctly vs. prior_question_elapsed_time, with mean and trend lines]

1.2 Exploring Questions

1.3 Exploring Lectures

Example Test

2. Model - Baseline

import numpy as np
import pandas as pd
import riiideducation
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('fivethirtyeight')
import seaborn as sns
import os
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import gc
import sys
pd.set_option('display.max_rows', None)

%%time
cols_to_load = ['row_id', 'user_id', 'answered_correctly', 'content_id', 'prior_question_had_explanation', 'prior_question_elapsed_time']
train = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")[cols_to_load]
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('boolean')

print("Train size:", train.shape)
%%time

questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
example_test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')
example_sample_submission = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv')
train.head()
   row_id  user_id  answered_correctly  content_id prior_question_had_explanation  prior_question_elapsed_time
0       0      115                   1        5692                           <NA>                          NaN
1       1      115                   1        5716                          False                      37000.0
2       2      115                   1         128                          False                      55000.0
3       3      115                   1        7860                          False                      19000.0
4       4      115                   1        7922                          False                      11000.0
train.shape
(101230332, 6)
%%time
#adding user features
user_df = train[train.answered_correctly != -1].groupby('user_id').agg({'answered_correctly': ['count', 'mean']}).reset_index()
user_df.columns = ['user_id', 'user_questions', 'user_mean']

# answered_correctly is -1 for lectures, so unstacking gives one column per value (-1, 0, 1)
user_lect = train.groupby(["user_id", "answered_correctly"]).size().unstack()
user_lect.columns = ['Lecture', 'Wrong', 'Right']
user_lect['Lecture'] = user_lect['Lecture'].fillna(0)
user_lect = user_lect.astype('Int64')
# flag users who have watched at least one lecture
user_lect['watches_lecture'] = np.where(user_lect.Lecture > 0, 1, 0)
user_lect = user_lect.reset_index()
user_lect = user_lect[['user_id', 'watches_lecture']]

user_df = user_df.merge(user_lect, on = "user_id", how = "left")
del user_lect
user_df.head()
CPU times: user 19.9 s, sys: 4.77 s, total: 24.7 s
Wall time: 24.7 s
   user_id  user_questions  user_mean  watches_lecture
0      115              46   0.695652                0
1      124              30   0.233333                0
2     2746              19   0.578947                1
3     5382             125   0.672000                1
4     8623             109   0.642202                1
%%time
#adding content features
content_df = train[train.answered_correctly != -1].groupby('content_id').agg({'answered_correctly': ['count', 'mean']}).reset_index()
content_df.columns = ['content_id', 'content_questions', 'content_mean']
CPU times: user 15.8 s, sys: 2.79 s, total: 18.6 s
Wall time: 18.6 s
%%time
#using one of the validation sets composed by tito
cv2_train = pd.read_pickle("../input/riiid-cross-validation-files/cv2_train.pickle")['row_id']
cv2_valid = pd.read_pickle("../input/riiid-cross-validation-files/cv2_valid.pickle")['row_id']
CPU times: user 1.77 s, sys: 7.91 s, total: 9.68 s
Wall time: 14.1 s
train = train[train.answered_correctly != -1]

#save mean before splitting
#please be aware that there is an issue with train.prior_question_elapsed_time.mean()
#see https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/195032
mean_prior = train.prior_question_elapsed_time.astype("float64").mean()

validation = train[train.row_id.isin(cv2_valid)]
train = train[train.row_id.isin(cv2_train)]

validation = validation.drop(columns = "row_id")
train = train.drop(columns = "row_id")

del cv2_train, cv2_valid
gc.collect()
64
label_enc = LabelEncoder()

train = train.merge(user_df, on = "user_id", how = "left")
train = train.merge(content_df, on = "content_id", how = "left")
train['content_questions'].fillna(0, inplace = True)
train['content_mean'].fillna(0.5, inplace = True)
train['watches_lecture'].fillna(0, inplace = True)
train['user_questions'].fillna(0, inplace = True)
train['user_mean'].fillna(0.5, inplace = True)
train['prior_question_elapsed_time'].fillna(mean_prior, inplace = True)
train['prior_question_had_explanation'].fillna(False, inplace = True)
train['prior_question_had_explanation'] = label_enc.fit_transform(train['prior_question_had_explanation'])
train[['content_questions', 'user_questions']] = train[['content_questions', 'user_questions']].astype(int)
train.sample(5)
             user_id  answered_correctly  content_id  prior_question_had_explanation  prior_question_elapsed_time  user_questions  user_mean  watches_lecture  content_questions  content_mean
81374868  1857010359                   0         871                               0                      16000.0            1178   0.726655                1               4561      0.650296
52733143  1199106875                   1        3871                               1                      13000.0            2871   0.558690                1              58320      0.795953
71174462  1625702355                   0        6257                               1                      14000.0            5032   0.648649                1              17200      0.717791
93540588  2130517850                   1         860                               1                      16000.0             518   0.762548                1               9911      0.797800
26006872   595305625                   1        5010                               1                      16000.0             479   0.649269                1              10766      0.789987
validation = validation.merge(user_df, on = "user_id", how = "left")
validation = validation.merge(content_df, on = "content_id", how = "left")
validation['content_questions'].fillna(0, inplace = True)
validation['content_mean'].fillna(0.5, inplace = True)
validation['watches_lecture'].fillna(0, inplace = True)
validation['user_questions'].fillna(0, inplace = True)
validation['user_mean'].fillna(0.5, inplace = True)
validation['prior_question_elapsed_time'].fillna(mean_prior, inplace = True)
validation['prior_question_had_explanation'].fillna(False, inplace = True)
validation['prior_question_had_explanation'] = label_enc.transform(validation['prior_question_had_explanation'])  # transform, not fit_transform: reuse the encoding fitted on train
validation[['content_questions', 'user_questions']] = validation[['content_questions', 'user_questions']].astype(int)
validation.sample(5)
            user_id  answered_correctly  content_id  prior_question_had_explanation  prior_question_elapsed_time  user_questions  user_mean  watches_lecture  content_questions  content_mean
1639231    52507119                   1        9825                               1                      28000.0             138   0.514493                1               4527      0.776894
2206154  1929293949                   0        6786                               1                      35500.0             434   0.449309                1              18230      0.758640
2261289  1980024567                   0        9875                               1                      16000.0             104   0.625000                1               6313      0.369555
2433892  2130775706                   0        6043                               0                      33000.0              31   0.580645                0              15324      0.662099
1978050  1739122369                   1        4120                               0                      26000.0             249   0.771084                1             199372      0.275585
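The merge/fill/encode steps above are repeated verbatim for train, validation, and (below) the test batches. As a sketch, they could be folded into a single helper to keep the three code paths in sync (add_features is my name, not the kernel's):

def add_features(df):
    # merge the precomputed user and content aggregates
    df = df.merge(user_df, on="user_id", how="left")
    df = df.merge(content_df, on="content_id", how="left")
    # fall back to neutral values for unseen users/contents
    df['content_questions'] = df['content_questions'].fillna(0).astype(int)
    df['content_mean'] = df['content_mean'].fillna(0.5)
    df['watches_lecture'] = df['watches_lecture'].fillna(0)
    df['user_questions'] = df['user_questions'].fillna(0).astype(int)
    df['user_mean'] = df['user_mean'].fillna(0.5)
    df['prior_question_elapsed_time'] = df['prior_question_elapsed_time'].fillna(mean_prior)
    # reuse the encoder fitted on train so the mapping stays consistent
    df['prior_question_had_explanation'] = label_enc.transform(
        df['prior_question_had_explanation'].fillna(False))
    return df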
# features = ['user_questions', 'user_mean', 'content_questions', 'content_mean', 'watches_lecture',
#             'prior_question_elapsed_time', 'prior_question_had_explanation']

features = ['user_questions', 'user_mean', 'content_questions', 'content_mean', 'prior_question_elapsed_time']


#for now, just taking 10,000,000 rows for training
train = train.sample(n=10000000, random_state = 1)

y_train = train['answered_correctly']
train = train[features]

y_val = validation['answered_correctly']
validation = validation[features]
params = {'objective': 'binary',
          'metric': 'auc',
          'seed': 2020,
          'learning_rate': 0.1, #default
          "boosting_type": "gbdt" #default
         }
lgb_train = lgb.Dataset(train, y_train, categorical_feature = None)
lgb_eval = lgb.Dataset(validation, y_val, categorical_feature = None)
del train, y_train, validation, y_val
gc.collect()
80
%%time
model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=50,
    num_boost_round=10000,
    early_stopping_rounds=8
)
Training until validation scores don't improve for 8 rounds
[50]	training's auc: 0.757093	valid_1's auc: 0.762699
[100]	training's auc: 0.757629	valid_1's auc: 0.763182
[150]	training's auc: 0.75783	valid_1's auc: 0.76329
[200]	training's auc: 0.757989	valid_1's auc: 0.763359
[250]	training's auc: 0.758123	valid_1's auc: 0.763419
Early stopping, best iteration is:
[284]	training's auc: 0.758218	valid_1's auc: 0.763463
CPU times: user 38min 23s, sys: 46.6 s, total: 39min 9s
Wall time: 12min 11s
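Note that verbose_eval and early_stopping_rounds are lgb.train arguments in LightGBM 3.x; in LightGBM >= 4.0 they were removed in favor of callbacks, so the equivalent call would be:

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    num_boost_round=10000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=8),
        lgb.log_evaluation(period=50),
    ],
)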
lgb.plot_importance(model)
plt.show()

env = riiideducation.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df.merge(user_df, on = "user_id", how = "left")
    test_df = test_df.merge(content_df, on = "content_id", how = "left")
    test_df['content_questions'].fillna(0, inplace = True)
    test_df['content_mean'].fillna(0.5, inplace = True)
    test_df['watches_lecture'].fillna(0, inplace = True)
    test_df['user_questions'].fillna(0, inplace = True)
    test_df['user_mean'].fillna(0.5, inplace = True)
    test_df['prior_question_elapsed_time'].fillna(mean_prior, inplace = True)
    test_df['prior_question_had_explanation'].fillna(False, inplace = True)
    test_df['prior_question_had_explanation'] = label_enc.transform(test_df['prior_question_had_explanation'])  # transform, not fit_transform: reuse the encoding fitted on train
    test_df[['content_questions', 'user_questions']] = test_df[['content_questions', 'user_questions']].astype(int)
    test_df['answered_correctly'] =  model.predict(test_df[features])
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])
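The riiideducation API serves the hidden test set in small time-ordered batches; each batch must be scored and returned through env.predict() before the next one is released, which prevents using future interactions when predicting past ones.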


If you found this post helpful, please click the recommend button :)






© 2020 modified by Tae You Kim
