Ask Gemma, Data Science¶

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
/kaggle/input/data-assistants-with-gemma/submission_categories.txt
/kaggle/input/data-assistants-with-gemma/submission_instructions.txt
/kaggle/input/understanding-contextual-questions-answers/train.csv
/kaggle/input/gemma/keras/gemma_2b_en/2/config.json
/kaggle/input/gemma/keras/gemma_2b_en/2/tokenizer.json
/kaggle/input/gemma/keras/gemma_2b_en/2/metadata.json
/kaggle/input/gemma/keras/gemma_2b_en/2/model.weights.h5
/kaggle/input/gemma/keras/gemma_2b_en/2/assets/tokenizer/vocabulary.spm
/kaggle/input/1000-data-science-concepts/data_science_concepts.csv

Package Installation and Library Imports¶

keras-nlp is a Python library that provides tools and utilities for natural language processing (NLP) tasks using Keras. It offers pre-built models for text classification, sequence tagging, and language modeling. The library includes functionalities for tokenization, word embeddings, text preprocessing, and evaluation metrics. It seamlessly integrates with Keras, making it easier to build and train NLP models.

In [2]:
# Install keras-nlp first, then Keras 3 last.
!pip install -q -U keras-nlp
!pip install -q -U "keras>=3"  # quote the specifier so the shell doesn't parse >= as a redirect
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.

tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.2.1 which is incompatible.
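These dependency warnings are expected on Kaggle: the base image ships TensorFlow 2.15, which pins Keras 2, while we deliberately upgrade to Keras 3 for the JAX backend, so the conflict is harmless here. A quick sanity check that the upgrade landed (a minimal sketch; exact version numbers will vary by image):

import keras
import keras_nlp
print(keras.__version__)      # should start with "3." after the upgrade
print(keras_nlp.__version__)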

Python basic modules

  • os : Provides ways to interact with the operating system and its environment variables.
  • numpy : Fundamental package for numerical computing with arrays (imported as np).
  • pandas : Powerful data processing tool, ideal for handling CSV files and other forms of structured data.
  • re : Provides support for working with regular expressions, enabling powerful pattern-based string operations.

Wordcloud module

  • WordCloud : Python library used for generating word clouds, which are visual representations of text data where the size of each word indicates its frequency or importance.
  • STOPWORDS : A set of commonly used words that are excluded from text analysis because they typically carry little meaning on their own.

Keras module

  • GemmaCausalLM : A model class from the keras-nlp library that offers a convenient way to build and train causal language models with Keras, applicable to NLP tasks such as text generation, machine translation, and dialog systems.

IPython.display module

  • Markdown : Used to output text in Markdown format.
  • display : Used to display objects in Jupyter notebooks (a short usage sketch follows this list).
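Neither helper is used elsewhere in this notebook, so here is the typical usage pattern as a minimal sketch: rendering a model's text response as formatted Markdown rather than a raw string (the response text is illustrative).

from IPython.display import display, Markdown

response = "**Supervised learning** uses *labeled* data."  # e.g. text returned by a model
display(Markdown(response))  # renders the string as rich Markdown in the notebook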
In [3]:
import sklearn
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import keras
import keras_nlp
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
from keras_nlp.models import GemmaCausalLM
import re
import warnings
warnings.filterwarnings('ignore')
2024-04-14 17:22:31.472980: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-14 17:22:31.473188: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-14 17:22:31.597863: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
In [4]:
import os
os.environ["KERAS_BACKEND"] = "jax"  # Or "torch" or "tensorflow".
# Note: KERAS_BACKEND is read when keras is first imported, so ideally set it
# before the imports in the previous cell.
# Avoid memory fragmentation on JAX backend.
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00"

Loading Dataset¶

Dataset Link: https://www.kaggle.com/datasets/hserdaraltan/1000-data-science-concepts

This dataset covers more than 1,000 common data science concepts spanning statistics, machine learning, and artificial intelligence. It has two columns: one contains questions or instructions, the other contains the corresponding responses.

To download a dataset, follow these simple steps:

  1. Look for the "Input" option located below the "Notebook" section in the right-side menu.
  2. Click on the "+ Add Input" button.
  3. In the search bar that appears, type "1000-data-science-concepts".
  4. Find the dataset in the search results and click the "+" button to attach it to your notebook; Kaggle makes it available under /kaggle/input automatically.
In [5]:
# Load only the first 200 rows to keep fine-tuning time manageable
data = pd.read_csv('/kaggle/input/1000-data-science-concepts/data_science_concepts.csv', nrows=200)
data.head() # First 5 rows of the dataset
Out[5]:
Question Answer
0 What is under-fitting and overfitting in machine learning? Underfitting is when a model is too simple, and overfitting is when it's too complex, making it perform poorly on new data.
1 Can you explain what a false positive and a false negative are? A false positive incorrectly indicates a condition is present when it's not, while a false negative misses detecting a condition that is there.
2 Clarify the concept of Phase IV. Phase IV studies, also known as post-marketing surveillance, are conducted after a drug or medical product is made available to the general public. They aim to monitor the product's safety, efficacy, and long-term effects in a larger and more diverse population, providing valuable insights into real-world usage. Phase IV studies help regulators, healthcare providers, and patients make informed decisions about the product's continued use by assessing its risks and benefits over an extended period outside the controlled environment of clinical trials.
3 What is semi-supervised learning described in a short description? Semi-supervised learning integrates both labeled and unlabeled data during model training. By leveraging the abundance of unlabeled data alongside limited labeled data, it enhances model performance and generalization to new examples, offering scalability and efficiency in scenarios where acquiring labeled data is resource-intensive or impractical. This approach bridges the gap between supervised and unsupervised learning, unlocking the potential of vast unlabeled datasets for training robust machine learning models.
4 Discuss the parallelization of training in gradient boosting models. Parallelizing training of a gradient boosting model is indeed possible, leveraging the parallel processing capabilities of modern hardware, such as GPUs. Frameworks like XGBoost offer options like 'tree_method = 'gpu_hist'' to utilize GPUs for faster training. By distributing computation across multiple cores or devices simultaneously, parallelization accelerates the training process, significantly reducing training time and improving efficiency. This approach is particularly beneficial for large datasets and complex models, where traditional sequential training may be computationally intensive and time-consuming.
In [6]:
data.tail() # Last 5 rows of the dataset
Out[6]:
Question Answer
195 Why are activation functions required in neural networks? Activation functions introduce nonlinearity, enabling neural networks to learn complex relationships between inputs and outputs, enhancing model capacity and expressiveness.
196 Can you explain a bidirectional search algorithm? A bidirectional search algorithm runs two simultaneous searches: one forward from the starting point and one backward from the goal. The aim is to meet in the middle, thus potentially finding a solution faster than a unidirectional search.
197 Do gradient descent methods always converge to similar points? Gradient descent methods may converge to different local optima, which depend on the starting conditions and the nature of the cost function.
198 Describe word2vec. Word2vec is a suite of models used to produce word embeddings, trained to predict surrounding words in a linguistic context.
199 What is the difference between a generative and discriminative model? Generative models learn data categories, while discriminative models learn category distinctions. Discriminative models generally outperform generative models in classification tasks.

Dataset Information and Null Value Check

  • Check the number of features and entries in the dataset.
  • Check for any NULL values in the dataset.
In [7]:
print("Information of Dataset: ")
print(data.info(),'\n')

print("Check for NULL values: ")
print(data.isnull().sum().sum())

print("Shape of Dataset: ")
print(data.shape)
Information of Dataset: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  200 non-null    object
 1   Answer    200 non-null    object
dtypes: object(2)
memory usage: 3.2+ KB
None 

Check for NULL values: 
0
Shape of Dataset: 
(200, 2)

Visualize Data using Word Cloud¶

This code generates word clouds for each column in a DataFrame (data). Here's a step-by-step explanation:

  1. Initialization: Define a set of stopwords using the STOPWORDS set from the wordcloud library; these common words are filtered out of the clouds.

  2. Colormap Definition: Define the colormap used for the WordCloud images. Here the 'viridis' colormap is chosen and passed to the WordCloud constructor.

  3. Iteration through Columns: Iterate through each column in the dataset, concatenating all of that column's values into a single string and converting them to uppercase.

  4. WordCloud Generation: Generate a WordCloud for each column from its own concatenated string, customizing the width, height, stopwords, minimum font size, and colormap. Plot each WordCloud in its own figure, with a title indicating the column it represents.

Overall, this code visualizes the distribution of words in each column of the DataFrame by creating word clouds, providing insights into the most frequent words or terms within each column.

In [8]:
stopwords = set(STOPWORDS)

# Define the colormap
colormap = 'viridis'

# iterate through the columns of the DataFrame
for col in data.columns:
    # Concatenate all values in the column into a single string
    # and convert to uppercase
    col_words = ' '.join(str(val).upper() for val in data[col])

    # Generate WordCloud for the current column only
    # (a fresh string per column, so later clouds don't mix in earlier columns)
    wordcloud = WordCloud(width=500, height=600,
                          stopwords=stopwords,
                          min_font_size=8,
                          colormap=colormap).generate(col_words)

    # Plot the WordCloud image
    plt.figure(figsize=(5, 5), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title(f"Word Cloud for {col}")
    plt.show()

Gemma Model¶

Gemma is a family of lightweight open-source generative AI (GenAI) models developed by Google DeepMind, aimed primarily at developers and researchers. Gemma was released alongside Gemini, Google's closed-source generative AI models.

There are two main models in the Gemma collection: Gemma 2B and Gemma 7B. Both are text-to-text, decoder-only large language models (LLMs) with pretrained and instruction-tuned variants. Gemma 2B has 2 billion parameters, while Gemma 7B has 7 billion.

Google offers pretrained and instruction-tuned Gemma models suitable for running on laptops and workstations, available to developers through various platforms. Meta's Llama 2 is another open model designed to run on consumer hardware; it is often positioned as more of a general-purpose business tool, while Gemma is frequently favored for research and scientific tasks.

Inputs and Outputs

  • Input: Gemma models take in text strings, which can range from questions and prompts to longer documents that require summarization.
  • Output: In response, they generate text in English, offering answers, summaries, or other forms of text-based output, tailored to the input provided.


The code snippet below creates an instance of the GemmaCausalLM model and assigns it to the variable gemma_lm. It builds the model from the preset configuration named "gemma_2b_en". This preset specifies the architecture, hyperparameters, and other settings for the model.

In [9]:
#Create the model using the from_preset method
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")
gemma_lm.summary()
Attaching 'config.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/gemma/keras/gemma_2b_en/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.spm' from model 'keras/gemma/keras/gemma_2b_en/2' to your Kaggle notebook...
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Preprocessor: "gemma_causal_lm_preprocessor"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Tokenizer (type)                                   ┃                                             Vocab # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ gemma_tokenizer (GemmaTokenizer)                   │                                             256,000 │
└────────────────────────────────────────────────────┴─────────────────────────────────────────────────────┘
Model: "gemma_causal_lm"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                  ┃ Output Shape              ┃         Param # ┃ Connected to               ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ padding_mask (InputLayer)     │ (None, None)              │               0 │ -                          │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ token_ids (InputLayer)        │ (None, None)              │               0 │ -                          │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ gemma_backbone                │ (None, None, 2048)        │   2,506,172,416 │ padding_mask[0][0],        │
│ (GemmaBackbone)               │                           │                 │ token_ids[0][0]            │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ token_embedding               │ (None, None, 256000)      │     524,288,000 │ gemma_backbone[0][0]       │
│ (ReversibleEmbedding)         │                           │                 │                            │
└───────────────────────────────┴───────────────────────────┴─────────────────┴────────────────────────────┘
 Total params: 2,506,172,416 (9.34 GB)
 Trainable params: 2,506,172,416 (9.34 GB)
 Non-trainable params: 0 (0.00 B)

Test Model before Tuning¶

Before we start the fine-tuning process, let's see how the base Gemma model performs out of the box on our kind of questions. This section runs a simple question-answering test.

In [10]:
gemma_lm.generate("What is supervised machine learning?", max_length=64)
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1713115454.782714      24 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1713115454.856710      24 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1713115454.900669      24 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
Out[10]:
'What is supervised machine learning?\n\nSupervised machine learning is a type of machine learning where the algorithm is trained on a set of labeled data. The algorithm is then used to predict the outcome of new data.\n\nSupervised machine learning is a type of machine learning where the algorithm is trained on a set of labeled'
In [11]:
print(gemma_lm.generate("Can you explain neural networks?", max_length=256))
W0000 00:00:1713115475.541716      24 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
Can you explain neural networks?

A 100-W, 120-V lightbulb has a resistance of $12 \Omega$ when cold $\left(20^{\circ} \mathrm{C}\right)$ and $140 \Omega$ when on (hot). Calculate its power consumption at the instant it is turned on.

A 100-turn, 2.0-cm-diameter coil is at rest with its axis vertical. A uniform magnetic field $60^{\circ}$ away from vertical increases from 0.50 T to 1.50 T in 0.60 s. What is the induced emf in the coil?

A 100-W, 120-V lightbulb has a resistance of $12 \Omega$ when cold $\left(20^{\circ} \mathrm{C}\right)$ and $140 \Omega$ when on (hot). a. What is the resistance of the lightbulb when hot? b. What is the change in electrical energy requred if the light is on for one hour at room temperature? c. Suppose the lightbulb is turned on when the temperature is $20
In [12]:
# Prepare the dataset for fine-tuning: one "Question/Answer" prompt string per row
dataset = []

for index, row in data.iterrows():
    question, answer = row['Question'], row['Answer']
    template = f"Question:\n{question}\n\nAnswer:\n{answer}"
    dataset.append(template)
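Before training, it is worth eyeballing one formatted example to confirm the template looks right (a quick sanity check; the expected output below is the first row shown in data.head() above):

print(dataset[0])
# Question:
# What is under-fitting and overfitting in machine learning?
#
# Answer:
# Underfitting is when a model is too simple, and overfitting is when it's too complex, making it perform poorly on new data.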

Fine tuning with LoRA¶

LoRA (Low-Rank Adaptation of Large Language Models) is a technique that makes fine-tuning large language models efficient: the pretrained weights are frozen and only small low-rank update matrices are trained, significantly reducing the number of trainable parameters. The line below enables LoRA on the backbone of gemma_lm.
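Concretely, LoRA freezes a large weight matrix W and learns a low-rank update ΔW = A·B, so only A and B are trained. A minimal NumPy sketch of the idea (illustrative only, not the keras-nlp internals; the shapes match this notebook's hidden size and rank):

import numpy as np

d, r = 2048, 64                    # hidden size and LoRA rank
W = np.random.randn(d, d)          # frozen pretrained weight
A = np.random.randn(d, r) * 0.01   # trainable down-projection
B = np.zeros((r, d))               # trainable up-projection, initialized to zero

W_adapted = W + A @ B              # effective weight after adaptation
# Trainable entries: d*r + r*d = 262,144 vs. d*d = 4,194,304 in W itself

Applied across the model's projection layers at rank 64, this is why the trainable parameter count in the summary below drops from ~2.5 billion to ~21.8 million.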

In [13]:
#Enable LoRA for the model and set the LoRA rank to 64.
gemma_lm.backbone.enable_lora(rank=64)
gemma_lm.summary()
Preprocessor: "gemma_causal_lm_preprocessor"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Tokenizer (type)                                   ┃                                             Vocab # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ gemma_tokenizer (GemmaTokenizer)                   │                                             256,000 │
└────────────────────────────────────────────────────┴─────────────────────────────────────────────────────┘
Model: "gemma_causal_lm"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                  ┃ Output Shape              ┃         Param # ┃ Connected to               ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ padding_mask (InputLayer)     │ (None, None)              │               0 │ -                          │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ token_ids (InputLayer)        │ (None, None)              │               0 │ -                          │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ gemma_backbone                │ (None, None, 2048)        │   2,527,995,904 │ padding_mask[0][0],        │
│ (GemmaBackbone)               │                           │                 │ token_ids[0][0]            │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ token_embedding               │ (None, None, 256000)      │     524,288,000 │ gemma_backbone[0][0]       │
│ (ReversibleEmbedding)         │                           │                 │                            │
└───────────────────────────────┴───────────────────────────┴─────────────────┴────────────────────────────┘
 Total params: 2,527,995,904 (9.42 GB)
 Trainable params: 21,823,488 (83.25 MB)
 Non-trainable params: 2,506,172,416 (9.34 GB)

Memory Control and Training¶

This block limits the input sequence length (to control memory usage) and compiles the model with the specified loss function, optimizer, and metric:

  1. SparseCategoricalCrossentropy is used as the loss function to compute the cross-entropy loss between the true labels and the predicted probabilities output by the model. It measures the difference between the true distribution of the labels and the predicted distribution (see the toy example after this list).

  2. The AdamW optimizer efficiently updates the network's parameters (weights and biases) during training. Like Adam, it adapts the learning rate for each parameter based on estimates of the first and second moments of the gradients, and it additionally applies decoupled weight decay for regularization.

  3. SparseCategoricalAccuracy is used as the evaluation metric, tracking accuracy over the training tokens.
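To make item 1 concrete, here is the loss on a toy single-token batch (a minimal sketch, assuming Keras 3 with any backend):

import numpy as np
import keras

loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
y_true = np.array([2])                 # true class / token id
logits = np.array([[0.1, 0.2, 3.0]])   # raw scores, not probabilities
print(float(loss_fn(y_true, logits)))  # ~0.11: the model strongly favors class 2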

In [14]:
# Limit the input sequence length to 512 (to control memory usage).
gemma_lm.preprocessor.sequence_length = 512
# Use AdamW (Adam with decoupled weight decay)
optimizer = keras.optimizers.AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
)
# Exclude bias and layer-norm scale terms from weight decay
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# Compile the model
gemma_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()]
)
In [15]:
gemma_lm.fit(dataset, epochs=6, batch_size=1)
Epoch 1/6
W0000 00:00:1713115530.386434      82 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
200/200 ━━━━━━━━━━━━━━━━━━━━ 190s 711ms/step - loss: 0.2695 - sparse_categorical_accuracy: 0.5686
Epoch 2/6
200/200 ━━━━━━━━━━━━━━━━━━━━ 142s 711ms/step - loss: 0.2221 - sparse_categorical_accuracy: 0.6081
Epoch 3/6
200/200 ━━━━━━━━━━━━━━━━━━━━ 142s 711ms/step - loss: 0.2069 - sparse_categorical_accuracy: 0.6304
Epoch 4/6
200/200 ━━━━━━━━━━━━━━━━━━━━ 142s 711ms/step - loss: 0.1856 - sparse_categorical_accuracy: 0.6661
Epoch 5/6
200/200 ━━━━━━━━━━━━━━━━━━━━ 142s 711ms/step - loss: 0.1574 - sparse_categorical_accuracy: 0.7196
Epoch 6/6
200/200 ━━━━━━━━━━━━━━━━━━━━ 142s 711ms/step - loss: 0.1265 - sparse_categorical_accuracy: 0.7740
Out[15]:
<keras.src.callbacks.history.History at 0x7b57646789a0>
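If you want the tuning to outlive the session, you can persist the weights with Keras 3's standard serialization; a minimal sketch (the output path is illustrative):

# Saves all model variables, including the trained LoRA matrices
gemma_lm.save_weights("/kaggle/working/gemma_lm_finetuned.weights.h5")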

Test Model after Tuning¶

After training, let's see how much the Gemma model has improved. We'll rerun the question-answering test and compare the results with the pre-fine-tuning performance.

In [16]:
print(gemma_lm.generate("What is supervised machine learning?", max_length=256))
W0000 00:00:1713116405.944183      24 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
What is supervised machine learning?

Machine learning is a branch of artificial intelligence that allows computers to learn and perform tasks through experience, rather than being explicitly programmed. In supervised learning, computers are exposed to a labeled dataset of input-output pairs, where the output is known in advance. During training, the computers learn predictive models by analyzing the data patterns and relationships, and then make predictions based on new input data. This process enables computers to learn to perform tasks without explicit programming, making machine learning essential for applications like image classification, natural language processing, and recommendation systems.
In [17]:
print(gemma_lm.generate("What is regression? Which models can you use to solve a regression problem? ", max_length=256))
What is regression? Which models can you use to solve a regression problem? 

Regression analysis is a statistical technique used to determine the relationship between two or more variables, typically to predict a continuous outcome. There are various regression models available, each with its own set of assumptions and limitations. Understanding the assumptions and limitations of different regression models is crucial for correctly interpreting the results and ensuring accurate predictions.
In [18]:
print(gemma_lm.generate("What is K-fold cross-validation? ", max_length=256))
What is K-fold cross-validation? 

K-fold cross-validation involves dividing data into consecutive folds of equal size and then validating the model on each fold. This method is commonly used to assess the model's generalization ability.
In [19]:
print(gemma_lm.generate("What are the main parameters of the decision tree model?", max_length=256))
What are the main parameters of the decision tree model?

A 2-year-oldthrowaway rule for choosing leaf nodes is to choose data nodes with (a) few categories, and (rule) (b) many missing data values.

Consider the following statements:

<code> Sentiniel s = new Sentinel(5);
s.mRecyclerView. Đi
 sentinel. Đi
 return sentinel.mValue;
</code>

(c) Which data node access path is taken?

A 2-year-old throwaway rule for choosing leaf nodes is to choose data nodes with (a) few categories, and (b) many missing data values.
In [20]:
print(gemma_lm.generate("Can you explain neural networks?", max_length=256))
Can you explain neural networks?

A neural network is a network of computational nodes inspired by the human brain that processes information. It consists of multiple layers of nodes, where each node receives input from adjacent layers and processes it before passing the output to subsequent layers, mimicking the process of information transmission in the human brain. Neural networks are widely used in machine learning and artificial intelligence for tasks such as image recognition, natural language processing, and pattern recognition. By mimicking the human brain's processing capabilities, neural networks demonstrate impressive learning abilities, enabling them to perform complex tasks without explicit programming.
In [21]:
print(gemma_lm.generate("Can you explain what a false positive and a false negative are?", max_length=256))
Can you explain what a false positive and a false negative are?

A false positive and a false negative are outcomes that are erroneously classified as positive or negative, respectively, when they are actually the opposite. They occur when there is ambiguity in the classification process, leading to incorrect predictions. For example, a model may incorrectly classify a patient as healthy when they actually have symptoms of illness, resulting in suboptimal healthcare outcomes. Similarly, a false negative diagnosis can lead to delayed treatment or misdiagnosis, compromising patient health.