Q1(1)
import numpy as np
import matplotlib.pyplot as plt
mean=0
std_dev=1
num_samples=1000
r_num=np.random.normal(mean,std_dev,num_samples)
rounded_num=np.round(r_num).astype(int)
unique_values, frequencies=np.unique(rounded_num, return_counts=True)
print("Values|Frequencies")
for value, frequency in zip(unique_values, frequencies):
    print(f"{value:>5}|{frequency:>9}")
plt.bar(unique_values, frequencies, color='red', edgecolor='blue')
plt.title("Frequency Distribution")
plt.xlabel("Values")
plt.ylabel("Frequencies")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
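# Hedged extension (not part of the original answer): the plot changes on every
# run because no seed is fixed; one might replace the sampling line above with:
# rng = np.random.default_rng(42)   # 42 is an arbitrary, illustrative seed
# r_num = rng.normal(mean, std_dev, num_samples)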
Q1(2)
odd=[]
even=[]
range_first=int(input("Enter the first number:"))
range_last=int(input("Enter the last number:"))
Range=range(range_first, range_last+1)
for i in Range:
    if i%2==0:
        even.append(i)
    else:
        odd.append(i)
def is_prime(num):
    if num<2:
        return False
    for n in range(2,int(num**0.5)+1):
        if num%n==0:
            return False
    return True
prime_odd_20=[]
for i in odd:
    if is_prime(i):
        prime_odd_20.append(i)
    if len(prime_odd_20)==20:
        break
print("The first 20 odd prime numbers are:", prime_odd_20)
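# Hedged caveat: if the entered range holds fewer than 20 odd primes, the list
# above ends short. A sketch (reusing is_prime above) that searches odd numbers
# upward until 20 primes are found, regardless of the range:
from itertools import count
odd_primes = []
for n in count(3, 2):  # odd numbers 3, 5, 7, ...
    if is_prime(n):
        odd_primes.append(n)
    if len(odd_primes) == 20:
        break
print("First 20 odd primes:", odd_primes)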
Q1(3)
import pandas as pd
# Function to read a CSV file using pandas
def read_csv_with_pandas(file_path):
    try:
        data = pd.read_csv(file_path)
        print(data)
    except FileNotFoundError:
        print("The file was not found.")
    except pd.errors.EmptyDataError:
        print("The file is empty.")
    except Exception as e:
        print(f"An error occurred: {e}")
# Example usage
file_path = 'sample.csv'  # Replace with your actual CSV file path
read_csv_with_pandas(file_path)
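# Hedged extension (the helper name and chunksize value are illustrative
# assumptions): for files too large for memory, pandas can stream the CSV in
# chunks instead of loading it all at once.
def read_csv_in_chunks(file_path, chunksize=1000):  # hypothetical helper
    try:
        for chunk in pd.read_csv(file_path, chunksize=chunksize):
            print(chunk.shape)  # illustrative; real code would process each chunk
    except FileNotFoundError:
        print("The file was not found.")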
Q2(1)
my_tuple = (42, "Python", 3.14, True, [1, 2, 3])
reversed_tuple = my_tuple[::-1]
print("Reversed tuple:", reversed_tuple)
a, b, c, d, e = my_tuple
print("Unpacked values:", a, b, c, d, e)
print("Third element of tuple:", my_tuple[2])
print("Last three elements:", my_tuple[-3:])
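# Hedged aside: tuples are immutable, so the slice above builds a new tuple;
# item assignment such as my_tuple[0] = 99 raises TypeError. The list stored
# inside the tuple is still mutable, though: my_tuple[4].append(4) works.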
Q2(2)
d1 = {"a": 1, "b": 2, "c": 3}
d2 = {"d": 4, "e": 5, "f": 6}
print(d1)
print(d2)
merge_d = {**d1, **d2}  # "**" unpacks key-value pairs; a single "*" would merge only the keys into a set
print(merge_d)
New_D = {}
key_to_extract = ["a", "b"]
for key in d1:
    if key in key_to_extract:
        New_D[key] = d1[key]
print(New_D)
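# Hedged aside (reuses d1, d2, key_to_extract above; "|" needs Python 3.9+):
print(d1 | d2)  # dict union operator, equivalent to {**d1, **d2}
print({k: d1[k] for k in key_to_extract if k in d1})  # comprehension-based extraction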
Q3 and Q4(1)
import numpy as np
A= np.array([[1,2],[3,4]])
B = np.array([[5,6],[7,8]])
print(A)
print(B)
Add = np.add(A,B)
print(Add)
Sub = np.subtract(A,B)
print(Sub)
Multi = np.multiply(A,B)
print(Multi)
Div= np.divide(A,B)
print(Div)
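# Hedged note: np.multiply above is element-wise; the true matrix product uses
# the "@" operator (or np.matmul). With the same A and B:
MatMul = A @ B
print(MatMul)  # [[19 22], [43 50]]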
Q4(2)
import numpy as np
n = int(input("Enter the number of equations: "))
A = np.zeros((n, n))
B = np.zeros(n)
for i in range(n):
    for j in range(n):
        A[i][j] = float(input(f"Enter the coefficient {j+1} of equation {i+1}: "))
    B[i] = float(input(f"Enter the constant term of equation {i+1}: "))
print("Coefficient Matrix (A):")
print(A)
print("Constant Terms (B):")
print(B)
Solution = np.linalg.solve(A, B)
print(f"Solution of the linear system: {Solution}")
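# Hedged guard (an extension, not in the original): np.linalg.solve raises
# LinAlgError when A is singular, so the call above can be wrapped as:
try:
    print(f"Solution: {np.linalg.solve(A, B)}")
except np.linalg.LinAlgError:
    print("No unique solution: coefficient matrix A is singular.")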
Q5.
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
# Load dataset
df = sns.load_dataset('titanic')
# Display first few rows
print("Initial data preview:")
print(df.head())
# a. Handling missing values
print("\nMissing values:")
print(df.isnull().sum())
# Fill missing numerical values with median
df.fillna(df.median(numeric_only=True), inplace=True)
# Fill missing categorical values with mode
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])
# b. Data integration and normalization
# Convert categorical columns to numerical using Label Encoding
label_enc = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = label_enc.fit_transform(df[col])
# Normalize numerical features using Min-Max scaling
scaler = MinMaxScaler()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = scaler.fit_transform(df[num_cols])
# Standardize numerical features using StandardScaler
std_scaler = StandardScaler()
df[num_cols] = std_scaler.fit_transform(df[num_cols])
# Show the transformed data
print("\nData preview after preprocessing:")
print(df.head())
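# Hedged observation (a sketch, not part of the assignment): standardizing after
# Min-Max scaling yields the same values as standardizing the raw data, because
# Min-Max is a positive per-column affine map. A quick check on synthetic data:
x = np.random.rand(100, 1) * 50 + 10
a = StandardScaler().fit_transform(x)
b = StandardScaler().fit_transform(MinMaxScaler().fit_transform(x))
print(np.allclose(a, b))  # True, so the Min-Max step above is effectively redundant
# Caveat: select_dtypes(include=['object']) skips category-dtype columns
# (e.g. 'deck' in this dataset), which therefore keep their NaNs unencoded.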
Q6.
import pandas as pd
import numpy as np
data = {
    'Student_ID': [1, 2, 3, 4, 5, 6, 7, 8],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen'],
    'Age': [15, 16, np.nan, 15, 16, 17, 15, 16],
    'Gender': ['F', 'M', 'M', 'M', 'F', 'M', 'F', np.nan],
    'Math_score': [85, 90, np.nan, 75, 60, 95, 80, 88],
    'Science_score': [78, 82, 70, 65, 55, 99, 77, 60],
    'English_score': [88, 87, 75, 70, np.nan, 96, 89, 92]
}
df = pd.DataFrame(data)
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Math_score'] = df['Math_score'].fillna(df['Math_score'].mean())
df['English_score'] = df['English_score'].fillna(df['English_score'].mean())
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
def cap_outliers(col):
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    return col.clip(lower, upper)
df['Math_score'] = cap_outliers(df['Math_score'])
df['Science_score'] = cap_outliers(df['Science_score'])
df['English_score'] = cap_outliers(df['English_score'])
print(df)
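# Optional check (a sketch, not required by the question): flag which entries
# the IQR rule treats as outliers; run this before the capping step above.
def iqr_outlier_mask(col):  # hypothetical helper using the same Tukey fences
    q1, q3 = col.quantile(0.25), col.quantile(0.75)
    iqr = q3 - q1
    return (col < q1 - 1.5 * iqr) | (col > q3 + 1.5 * iqr)
print(iqr_outlier_mask(df['Science_score']))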
Q7.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
data = {
    'Student_ID': [1, 2, 3, 4, 5, 6, 7, 8],
    'Name': ['Alice', 'Bob', 'John', 'Eva', 'Grace', 'Helen', 'Ivana', 'Josh'],
    'Age': [15, 16, np.nan, 15, 16, 17, 15, 16],
    'Gender': ['F', 'M', 'M', 'M', 'F', 'M', 'F', 'M'],
    'Math_score': [85, 90, np.nan, 75, 60, 95, 80, 88],
    'Science_score': [78, 82, 70, 65, 55, 99, 77, 60],
    'English_score': [88, 87, 75, 70, np.nan, 96, 89, 92]
}
df = pd.DataFrame(data)
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Math_score'] = df['Math_score'].fillna(df['Math_score'].mean())
df['Science_score'] = df['Science_score'].fillna(df['Science_score'].mean())
df['English_score'] = df['English_score'].fillna(df['English_score'].mean())
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Science_score'] = np.log(df['Science_score'])
df['English_score'] = np.log(df['English_score'])
scaler = MinMaxScaler()
df['Math_score_Scaled'] = scaler.fit_transform(df[['Math_score']])
print(df)
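# Hedged note: np.log is undefined at 0 and for negative values; if zero scores
# can occur, np.log1p (log(1 + x)) is the usual safer transform, e.g.:
print(np.log1p(pd.Series([0, 55, 99])))  # illustrative values, not dataset rows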
Q8.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris
import seaborn as sns
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.hist(bins=20, figsize=(10,6), edgecolor='black')  # df.hist creates its own figure
plt.suptitle("Histogram")
plt.show()
plt.figure(figsize=(8,6))
sns.boxplot(data=df)
plt.suptitle("Boxplot")
plt.xlabel("Features")
plt.ylabel("Values")
plt.show()
Q9.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
df = pd.read_csv('https://github.com/kb22/Heart-Disease-Prediction/raw/master/dataset.csv')
print(df.head())
# Scatter Plot
plt.figure(figsize=(8, 6))
plt.scatter(df['age'], df['chol'])  # Replace 'age' and 'chol' with desired columns
plt.xlabel('Age')
plt.ylabel('Cholesterol')
plt.title('Scatter Plot of Age vs. Cholesterol')
plt.show()
# Bar Plot
plt.figure(figsize=(8, 6))
sex_counts = df['sex'].value_counts()  # Replace 'sex' with desired column
plt.bar(sex_counts.index.astype(str), sex_counts.values)  # keeps labels and counts aligned
plt.xlabel('Sex')
plt.ylabel('Count')
plt.title('Bar Plot of Sex Distribution')
plt.show()
# Density Plot
plt.figure(figsize=(8, 6))
sns.kdeplot(df['chol'])  # Replace 'chol' with desired column
plt.xlabel('Cholesterol')
plt.ylabel('Density')
plt.title('Density Plot of Cholesterol')
plt.show()
# Pie Chart
plt.figure(figsize=(8, 6))
cp_counts = df['cp'].value_counts()  # Replace 'cp' with desired column
plt.pie(cp_counts, labels=cp_counts.index, autopct='%1.1f%%')  # labels match the counts' order
plt.title('Pie Chart of Chest Pain Type Distribution')
plt.show()
# Bubble Plot
plt.figure(figsize=(8, 6))
plt.scatter(df['age'], df['chol'], s=df['trestbps']*5, alpha=0.5)  # Replace with desired columns
plt.xlabel('Age')
plt.ylabel('Cholesterol')
plt.title('Bubble Plot of Age vs. Cholesterol (Size: Resting Blood Pressure)')
plt.show()
# Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Heatmap of Correlation Matrix')
plt.show()
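# Hedged note: this CSV is entirely numeric; on mixed-type frames, pandas 2+
# needs df.corr(numeric_only=True) to skip non-numeric columns.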
Q10.
import pandas as pd
import seaborn as sns
df = sns.load_dataset("iris")
print("First 5 rows of dataset:")
print(df.head())
print("\nMedian Values:")
print(df.select_dtypes(include=['number']).median())
print("\nStandard Deviation:")
print(df.select_dtypes(include=['number']).std())
print("\nVariance:")
print(df.select_dtypes(include=['number']).var())
print("\nMinimum Values:")
print(df.select_dtypes(include=['number']).min())
print("\nMaximum Values:")
print(df.select_dtypes(include=['number']).max())
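# Hedged shortcut: df.describe() reports count, mean, std, min, quartiles and
# max for all numeric columns in a single call:
print(df.describe())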
Q11.
import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.weightstats import ztest
# Step 1: Generate Synthetic Dataset
data = {
    'Group_A': np.random.normal(loc=50, scale=10, size=30),
    'Group_B': np.random.normal(loc=55, scale=12, size=30),
    'Category': np.random.choice(['Yes', 'No'], size=30, p=[0.6, 0.4])
}
df = pd.DataFrame(data)
print(df.head())
# Step 2: One-Sample T-test
t_stat, p_value = stats.ttest_1samp(df['Group_A'], 50)
print("One-Sample T-Test: t-value =", t_stat, "p-value =", p_value)
# Step 3: Independent Two-Sample T-test
t_stat, p_value = stats.ttest_ind(df['Group_A'], df['Group_B'])
print("Independent Two-Sample T-Test: t-value =", t_stat, "p-value =", p_value)
# Step 4: Paired T-test
df['Group_A'] = df['Group_A'] + np.random.normal(0.5, size=30)
t_stat, p_value = stats.ttest_rel(df['Group_A'], df['Group_B'])
print("Paired T-Test: t-value =", t_stat, "p-value =", p_value)
# Step 5: Z-test
z_stat, p_value = ztest(df['Group_A'], df['Group_B'])
print("Z-Test: z-value =", z_stat, "p-value =", p_value)
# Step 6: Chi-Square Test
df['Above_Mean'] = df['Group_A'] > df['Group_A'].mean()
crosstab = pd.crosstab(df['Category'], df['Above_Mean'])
chi2_stat, p_value, dof, expected = stats.chi2_contingency(crosstab)
print("Chi-Square Test: chi2-value =", chi2_stat, "p-value =", p_value)
if p_value < 0.05:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")
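# Hedged note: every run regenerates the random data, so all statistics above
# change run to run. Seeding NumPy before building `data` makes them repeatable:
# np.random.seed(0)   # 0 is an arbitrary, illustrative seed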
Q12.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Sample dataset (for demonstration)
data = {
    "Item_Weight": [9.3, 5.92, 17.5, 19.2, 8.9, 15.3, 10.5, 7.8, 12.4, 20.1],
    "Item_Visibility": [0.016, 0.019, 0.017, 0.000, 0.066, 0.035, 0.045, 0.025, 0.012, 0.030],
    "Item_MRP": [249.8, 48.2, 141.6, 182.1, 53.9, 210.5, 150.3, 200.2, 130.4, 275.0],
    "Outlet_Establishment_Year": [1999, 2009, 1999, 1998, 1987, 2004, 1995, 2002, 1997, 2010],
    "Item_Outlet_Sales": [3735.1, 443.4, 2097.3, 732.4, 994.7, 2500.2, 1750.6, 2100.8, 1950.9, 1400.0]
}
# Convert data to DataFrame
df = pd.DataFrame(data)
# Display basic dataset info
print("\nDataset Overview:\n", df.head())
# Define features (X) and target variable (y)
X = df[["Item_Weight", "Item_Visibility", "Item_MRP", "Outlet_Establishment_Year"]]
y = df["Item_Outlet_Sales"]
# Split data into Training (80%) and Testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict sales using the trained model
y_pred = model.predict(X_test)
# Model Performance Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nModel Performance Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R^2): {r2:.2f}")
# Visualizing Actual vs Predicted Sales
plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, color="blue", label="Actual vs Predicted")
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color="red", linestyle="--")
plt.xlabel("Actual Sales")
plt.ylabel("Predicted Sales")
plt.title("Actual vs Predicted Sales")
plt.legend()
plt.show()
# Display model coefficients
feature_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print("\nModel Coefficients:\n", feature_importance)
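# Hedged caveat: with only 10 rows, the 20% test split holds just 2 samples, so
# the MSE and R^2 above are extremely noisy. A sketch using 5-fold
# cross-validation on the same X and y instead:
from sklearn.model_selection import cross_val_score
cv_r2 = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print("Cross-validated R^2 per fold:", cv_r2)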
Q13.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Gaussian Naive Bayes with Pima Indians Diabetes Dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
column_names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
                "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
df = pd.read_csv(url, names=column_names)
X = df.drop("Outcome", axis=1)
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (GaussianNB): {accuracy:.2f}")
# -------------------------------
# Multinomial Naive Bayes for Text Data
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
text_data = [
    "I love programming in python",
    "Python is great for data science",
    "I enjoy coding in python",
    "I am learning machine learning",
    "Data science is fun with python",
    "I like playing football"
]
labels = [1, 1, 1, 0, 1, 0]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text_data)
y = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (MultinomialNB): {accuracy:.2f}")
# -------------------------------
# Bernoulli Naive Bayes with Binary Features
from sklearn.naive_bayes import BernoulliNB
X = np.array([[1, 0, 1], [1, 1, 1], [0, 1, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0]])
y = np.array([1, 1, 0, 0, 1, 0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (BernoulliNB): {accuracy:.2f}")
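# Usage sketch: classifying an unseen sentence (our own example, not from the
# data) with the MultinomialNB model and vectorizer trained above; given the
# tiny training set, it would plausibly predict label 1 (python-related).
new_text = ["python makes data analysis fun"]
print(model.predict(vectorizer.transform(new_text)))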