[~] Refactor

2021-07-15 09:49:15 +03:00 · 2021-07-15 09:49:15 +03:00 · e60b290a00
commit e60b290a00
parent 10553c739c
1 changed files with 190 additions and 155 deletions
--- a/python/tasks/jigsaw_toxic.py
+++ b/python/tasks/jigsaw_toxic.py
@@ -66,188 +66,223 @@ def kernel_1_sample_scrap(
    )
 def kernel_2():
-	import numpy as np # linear algebra
+    import numpy as np # linear algebra
-	import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+    import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
-	from tqdm import tqdm
+    from tqdm import tqdm
-	from sklearn.model_selection import train_test_split
+    from sklearn.model_selection import train_test_split
-	import tensorflow as tf
+    import tensorflow as tf
-	from keras.models import Sequential
+    from keras.models import Sequential
-	from keras.layers.recurrent import LSTM, GRU,SimpleRNN
+    from keras.layers.recurrent import LSTM, GRU,SimpleRNN
-	from keras.layers.core import Dense, Activation, Dropout
+    from keras.layers.core import Dense, Activation, Dropout
-	from keras.layers.embeddings import Embedding
+    from keras.layers.embeddings import Embedding
-	from keras.layers.normalization import BatchNormalization
+    from keras.layers.normalization import BatchNormalization
-	from keras.utils import np_utils
+    from keras.utils import np_utils
-	from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
+    from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
-	from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
+    from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
-	from keras.preprocessing import sequence, text
+    from keras.preprocessing import sequence, text
-	from keras.callbacks import EarlyStopping
+    from keras.callbacks import EarlyStopping
-	import matplotlib.pyplot as plt
+    import matplotlib.pyplot as plt
-	import seaborn as sns
+    import seaborn as sns
-	#%matplotlib inline
+    #%matplotlib inline
-	from plotly import graph_objs as go
+    from plotly import graph_objs as go
-	import plotly.express as px
+    import plotly.express as px
-	import plotly.figure_factory as ff
+    import plotly.figure_factory as ff
-	# %% [markdown]
+    # %% [markdown]
-	# # Configuring TPU's
+    # # Configuring TPU's
-	#
+    #
-	# For this version of Notebook we will be using TPU's as we have to built a BERT Model
+    # For this version of the notebook we will be using TPUs, as we have to build a BERT model
-	# %% [code]
+    # %% [code]
-	# Detect hardware, return appropriate distribution strategy
+    # Detect hardware, return appropriate distribution strategy
-	try:
+    try:
-		# TPU detection. No parameters necessary if TPU_NAME environment variable is
+        # TPU detection. No parameters necessary if TPU_NAME environment variable is
-		# set: this is always the case on Kaggle.
+        # set: this is always the case on Kaggle.
-		tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
+        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
-		print('Running on TPU ', tpu.master())
+        print('Running on TPU ', tpu.master())
-	except ValueError:
+    except ValueError:
-		tpu = None
+        tpu = None
-	if tpu:
+    if tpu:
-		tf.config.experimental_connect_to_cluster(tpu)
+        tf.config.experimental_connect_to_cluster(tpu)
-		tf.tpu.experimental.initialize_tpu_system(tpu)
+        tf.tpu.experimental.initialize_tpu_system(tpu)
-		strategy = tf.distribute.experimental.TPUStrategy(tpu)
+        strategy = tf.distribute.experimental.TPUStrategy(tpu)
-	else:
+    else:
-		# Default distribution strategy in Tensorflow. Works on CPU and single GPU.
+        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
-		strategy = tf.distribute.get_strategy()
+        strategy = tf.distribute.get_strategy()
-	print("REPLICAS: ", strategy.num_replicas_in_sync)
+    print("REPLICAS: ", strategy.num_replicas_in_sync)
-	# %% [code]
+    # %% [code]
-	train = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv')
+    train = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv')
-	validation = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
+    validation = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
-	test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
+    test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
-	# %% [markdown]
+    # %% [markdown]
-	# We will drop the other columns and approach this problem as a Binary Classification Problem and also we will have our exercise done on a smaller subsection of the dataset(only 12000 data points) to make it easier to train the models
+    # We will drop the other columns and approach this as a binary classification problem. We will also work on a smaller subsection of the dataset (only 12000 data points) to make it easier to train the models
-	# %% [code]
+    # %% [code]
-	train.drop(['severe_toxic','obscene','threat','insult','identity_hate'],axis=1,inplace=True)
+    train.drop(['severe_toxic','obscene','threat','insult','identity_hate'],axis=1,inplace=True)
-	# %% [code]
+    # %% [code]
-	train = train.loc[:12000,:]
+    train = train.loc[:12000,:]
-	train.shape
+    train.shape
-	# %% [markdown]
+    # %% [markdown]
-	# We will check the maximum number of words that can be present in a comment , this will help us in padding later
+    # We will check the maximum number of words present in a comment; this will help us with padding later
-	# %% [code]
+    # %% [code]
-	train['comment_text'].apply(lambda x:len(str(x).split())).max()
+    train['comment_text'].apply(lambda x:len(str(x).split())).max()
 	# %% [markdown]
 	# Writing a function for getting auc score for validation
-	# %% [code]
+    # %% [markdown]
-	def roc_auc(predictions,target):
+    # ### Data Preparation
 		'''
 		This methods returns the AUC Score when given the Predictions
 		and Labels
 		'''
-		fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
+    # %% [code]
-		roc_auc = metrics.auc(fpr, tpr)
+    xtrain, xvalid, ytrain, yvalid = train_test_split(train.comment_text.values, train.toxic.values,
-		return roc_auc
+                                                      stratify=train.toxic.values,
                                                      random_state=42,
                                                      test_size=0.2, shuffle=True)
-	# %% [markdown]
+    # %% [markdown]
-	# ### Data Preparation
+    # # Before We Begin
    #
    # Before we begin: if you are completely new to NLP and have never worked with text data, I am attaching a few kernels that will serve as a starting point for your journey
    # * https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial
    # * https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle
    #
    # If you want a more basic dataset to practice with here is another kernel which I wrote:
    # * https://www.kaggle.com/tanulsingh077/what-s-cooking
    #
    # Below are some resources to get started with basic neural networks; they will help us to easily understand the upcoming parts
    # * https://www.youtube.com/watch?v=aircAruvnKk&list=PL_h2yd2CGtBHEKwEH5iqTZH85wLS-eUzv
    # * https://www.youtube.com/watch?v=IHZwWFHWa-w&list=PL_h2yd2CGtBHEKwEH5iqTZH85wLS-eUzv&index=2
    # * https://www.youtube.com/watch?v=Ilg3gGewQ5U&list=PL_h2yd2CGtBHEKwEH5iqTZH85wLS-eUzv&index=3
    # * https://www.youtube.com/watch?v=tIeHLnjs5U8&list=PL_h2yd2CGtBHEKwEH5iqTZH85wLS-eUzv&index=4
    #
    # For Learning how to visualize test data and what to use view:
    # * https://www.kaggle.com/tanulsingh077/twitter-sentiment-extaction-analysis-eda-and-model
    # * https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda
-	# %% [code]
+    # %% [markdown]
-	xtrain, xvalid, ytrain, yvalid = train_test_split(train.comment_text.values, train.toxic.values,
+    # # Simple RNN
-													  stratify=train.toxic.values,
+    #
-													  random_state=42,
+    # ## Basic Overview
-													  test_size=0.2, shuffle=True)
+    #
    # What is a RNN?
    #
    # Recurrent Neural Networks (RNNs) are a type of neural network where the output from the previous step is fed as input to the current step. In traditional neural networks, all the inputs and outputs are independent of each other, but in cases such as predicting the next word of a sentence, the previous words are required, and hence there is a need to remember them. Thus RNNs came into existence, solving this issue with the help of a hidden layer.
    #
    # Why RNN's?
    #
    # https://www.quora.com/Why-do-we-use-an-RNN-instead-of-a-simple-neural-network
    #
    # ## In-Depth Understanding
    #
    # * https://medium.com/mindorks/understanding-the-recurrent-neural-network-44d593f112a2
    # * https://www.youtube.com/watch?v=2E65LDnM2cA&list=PL1F3ABbhcqa3BBWo170U4Ev2wfsF7FN8l
    # * https://www.d2l.ai/chapter_recurrent-neural-networks/rnn.html
    #
    # ## Code Implementation
    #
    # So first I will implement the model and then I will explain the code step by step
-	# %% [markdown]
+    # %% [code]
-	# # Before We Begin
+    # using keras tokenizer here
-	#
+    token = text.Tokenizer(num_words=None)
-	# Before we Begin If you are a complete starter with NLP and never worked with text data, I am attaching a few kernels that will serve as a starting point of your journey
+    max_len = 1500
 	# * https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial
 	# * https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle
 	#
 	# If you want a more basic dataset to practice with here is another kernel which I wrote:
 	# * https://www.kaggle.com/tanulsingh077/what-s-cooking
 	#
 	# Below are some Resources to get started with basic level Neural Networks, It will help us to easily understand the upcoming parts
 	# * https://www.youtube.com/watch?v=aircAruvnKk&list=PL_h2yd2CGtBHEKwEH5iqTZH85wLS-eUzv
 	# * https://www.youtube.com/watch?v=IHZwWFHWa-w&list=PL_h2yd2CGtBHEKwEH5iqTZH85wLS-eUzv&index=2
 	# * https://www.youtube.com/watch?v=Ilg3gGewQ5U&list=PL_h2yd2CGtBHEKwEH5iqTZH85wLS-eUzv&index=3
 	# * https://www.youtube.com/watch?v=tIeHLnjs5U8&list=PL_h2yd2CGtBHEKwEH5iqTZH85wLS-eUzv&index=4
 	#
 	# For Learning how to visualize test data and what to use view:
 	# * https://www.kaggle.com/tanulsingh077/twitter-sentiment-extaction-analysis-eda-and-model
 	# * https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda
-	# %% [markdown]
+    token.fit_on_texts(list(xtrain) + list(xvalid))
-	# # Simple RNN
+    xtrain_seq = token.texts_to_sequences(xtrain)
-	#
+    xvalid_seq = token.texts_to_sequences(xvalid)
 	# ## Basic Overview
 	#
 	# What is a RNN?
 	#
 	# Recurrent Neural Network(RNN) are a type of Neural Network where the output from previous step are fed as input to the current step. In traditional neural networks, all the inputs and outputs are independent of each other, but in cases like when it is required to predict the next word of a sentence, the previous words are required and hence there is a need to remember the previous words. Thus RNN came into existence, which solved this issue with the help of a Hidden Layer.
 	#
 	# Why RNN's?
 	#
 	# https://www.quora.com/Why-do-we-use-an-RNN-instead-of-a-simple-neural-network
 	#
 	# ## In-Depth Understanding
 	#
 	# * https://medium.com/mindorks/understanding-the-recurrent-neural-network-44d593f112a2
 	# * https://www.youtube.com/watch?v=2E65LDnM2cA&list=PL1F3ABbhcqa3BBWo170U4Ev2wfsF7FN8l
 	# * https://www.d2l.ai/chapter_recurrent-neural-networks/rnn.html
 	#
 	# ## Code Implementation
 	#
 	# So first I will implement the and then I will explain the code step by step
-	# %% [code]
+    #zero pad the sequences
-	# using keras tokenizer here
+    xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
-	token = text.Tokenizer(num_words=None)
+    xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)
 	max_len = 1500
-	token.fit_on_texts(list(xtrain) + list(xvalid))
+    word_index = token.word_index
 	xtrain_seq = token.texts_to_sequences(xtrain)
 	xvalid_seq = token.texts_to_sequences(xvalid)
-	#zero pad the sequences
+    # %% [code]
-	xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
+    #%%time
-	xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)
+    with strategy.scope():
        # A simpleRNN without any pretrained embeddings and one dense layer
        model = Sequential()
        model.add(Embedding(len(word_index) + 1,
                         300,
                         input_length=max_len))
        model.add(SimpleRNN(100))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
-	word_index = token.word_index
+    model.summary()
-	# %% [code]
+    return dict(
-	#%%time
+        model=model,
-	with strategy.scope():
+        xtrain_pad=xtrain_pad,
-		# A simpleRNN without any pretrained embeddings and one dense layer
+        ytrain=ytrain,
-		model = Sequential()
+        strategy=strategy,
-		model.add(Embedding(len(word_index) + 1,
+        xvalid_pad=xvalid_pad,
-						 300,
+        yvalid=yvalid,
-						 input_length=max_len))
+        xtrain_seq=xtrain_seq,
-		model.add(SimpleRNN(100))
+    )
 		model.add(Dense(1, activation='sigmoid'))
 		model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
 	model.summary()
-	# %% [code]
+def kernel_3(
-	model.fit(xtrain_pad, ytrain, nb_epoch=5, batch_size=64*strategy.num_replicas_in_sync) #Multiplying by Strategy to run on TPU's
+    o_2,
 ):
    # %% [markdown]
    # Writing a function for getting auc score for validation
-	# %% [code]
+    # %% [code]
-	scores = model.predict(xvalid_pad)
+    def roc_auc(predictions,target):
-	print("Auc: %.2f%%" % (roc_auc(scores,yvalid)))
+        '''
        This methods returns the AUC Score when given the Predictions
        and Labels
        '''
-	# %% [code]
+        fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
-	scores_model = []
+        roc_auc = metrics.auc(fpr, tpr)
-	scores_model.append({'Model': 'SimpleRNN','AUC_Score': roc_auc(scores,yvalid)})
+        return roc_auc
-	# %% [markdown]
+    # %% [code]
-	# ## Code Explanantion
+    o_2['model'].fit(
-	# * Tokenization<br><br>
+        o_2['xtrain_pad'],
-	#  So if you have watched the videos and referred to the links, you would know that in an RNN we input a sentence word by word. We represent every word as one hot vectors of dimensions : Numbers of words in Vocab +1. <br>
+        o_2['ytrain'],
-	#   What keras Tokenizer does is , it takes all the unique words in the corpus,forms a dictionary with words as keys and their number of occurences as values,it then sorts the dictionary in descending order of counts. It then assigns the first value 1 , second value 2 and so on. So let's suppose word 'the' occured the most in the corpus then it will assigned index 1 and vector representing 'the' would be a one-hot vector with value 1 at position 1 and rest zereos.<br>
+        nb_epoch=5,
-	#   Try printing first 2 elements of xtrain_seq you will see every word is represented as a digit now
+        batch_size=64*o_2['strategy'].num_replicas_in_sync
    ) #Multiplying by Strategy to run on TPU's
-	# %% [code]
+    # %% [code]
-	xtrain_seq[:1]
+    scores = o_2['model'].predict(o_2['xvalid_pad'])
    print(
        "Auc: %.2f%%" % (
            roc_auc(
                scores,
                o_2['yvalid']
            )
        )
    )
    # %% [code]
    scores_model = []
    scores_model.append(
        {
            'Model': 'SimpleRNN',
            'AUC_Score': roc_auc(
                scores,
                o_2['yvalid']
            )
        }
    )
    # %% [markdown]
    # ## Code Explanation
    # * Tokenization<br><br>
    #  So if you have watched the videos and referred to the links, you would know that in an RNN we input a sentence word by word. We represent every word as a one-hot vector of dimension: number of words in vocab + 1. <br>
    #   What the keras Tokenizer does is: it takes all the unique words in the corpus, forms a dictionary with words as keys and their number of occurrences as values, and then sorts the dictionary in descending order of counts. It then assigns the first word the index 1, the second word the index 2, and so on. So suppose the word 'the' occurred the most in the corpus; then it will be assigned index 1, and the vector representing 'the' would be a one-hot vector with value 1 at position 1 and zeros elsewhere.<br>
    #   Try printing the first 2 elements of xtrain_seq; you will see every word is represented as an integer now
    # %% [code]
    xtrain_seq[:1]