DMW_all_pracs | PDF Host

Q 1) import pandas as pd sales_data = pd read_csv("D:\\TYCS\\sem6\\PRACTICALS\\Codes\\DMW\\DMW_Prac\\P2\\sales_data.csv") customer_data = pd read_csv("D:\\TYCS\\sem6\\PRACTICALS\\Codes\\DMW\\DMW_Prac\\P2\\customer_data.csv" sales_data head() customer_data head() merged_data = pd merge(sales_data, customer_data, on = "Customer ID", how = "inner") print(merged_data head(10)) merged_data rename(columns = { 'Date': 'Transaction Date', 'Quantity': 'Quantity Sold', 'Total Price': 'Total Sale Amount', 'Name': 'Customer Name', 'Email': 'Customer Email', 'Phone Number': 'Customer Phone', 'Address': 'Customer Address' }, inplace =True ) print(merged_data columns) merged_data dropna(subset = ['Total Sale Amount'], inplace =True ) merged_data loc[:, 'Customer Email'] = merged_data['Customer Email'] fillna('Unknown') print(merged_data head(10)) print(merged_data dtypes) merged_data['Transaction Date'] = pd to_datetime(merged_data['Transaction Date'], format = "%d-%m-%Y") print(merged_data dtypes) total_sales_by_customer = merged_data groupby(['Customer ID', 'Customer Name'])[['Total Sale Amount', 'Quantity Sold']] sum() sales_by_product = merged_data groupby(['Product ID'])[['Total Sale Amount', 'Quantity Sold']] sum() print("\nTotal sales by customer \n", total_sales_by_customer) print("\nTotal sales by Product \n", sales_by_product) merged_data['Transaction Month'] = merged_data['Transaction Date'] dt to_period('M') sales_by_month = merged_data groupby(['Transaction Month'])[['Total Sale Amount']] sum() print(sales_by_month) transactions_per_customer = merged_data groupby('Customer ID')['Transaction ID'] nunique() print(transactions_per_customer) max_sale = merged_data['Total Sale Amount'] max() min_sale = merged_data['Total Sale Amount'] min() print(f"Max Sale Amount: ${max_sale:.2f}") print(f"Min Sale Amount: ${min_sale:.2f}") merged_data to_csv("D:\\TYCS\\sem6\\PRACTICALS\\Codes\\DMW\\DMW_Prac\\P2\\merged_data.csv", index =False ) In [6]: 2/28/26, 5:32 PM Prac2 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac2.html 1/6 Transaction ID Date Customer ID Product ID Quantity Unit Price \ 0 TXN0001 15-07-2024 CUST0040 PID001 10 200 1 TXN0002 26-10-2024 CUST0033 PID005 10 350 2 TXN0003 30-07-2024 CUST0012 PID006 7 500 3 TXN0004 22-12-2024 CUST0010 PID007 6 100 4 TXN0005 10-04-2024 CUST0044 PID003 3 400 5 TXN0006 26-11-2024 CUST0043 PID005 1 200 6 TXN0007 22-04-2024 CUST0025 PID005 1 500 7 TXN0008 06-01-2024 CUST0019 PID002 5 200 8 TXN0009 04-04-2024 CUST0026 PID002 9 350 9 TXN0010 04-07-2024 CUST0024 PID005 4 400 Total Price Name Email \ 0 2000.0 Alice Miller NaN 1 3500.0 John Smith john.smith@outlook.com 2 NaN Bob Johnson bob.johnson@outlook.com 3 600.0 Alice Johnson alice.johnson@hotmail.com 4 1200.0 Daniel Rodriguez daniel.rodriguez@outlook.com 5 200.0 Bob Jones bob.jones@outlook.com 6 500.0 John Miller NaN 7 1000.0 Bob Martinez bob.martinez@hotmail.com 8 3150.0 Jane Smith jane.smith@outlook.com 9 1600.0 Charlie Rodriguez charlie.rodriguez@hotmail.com Phone Number Address 0 (722) 885-6293 624 Oak St, Riverside, FL 77884 1 (733) 299-1696 111 Maple St, Hilltop, OH 97521 2 (324) 285-1102 273 Cedar St, Sunset, MI 31662 3 (268) 857-7514 728 Elm St, Townsville, FL 54183 4 (874) 320-7771 351 Birch St, Riverside, CA 91247 5 (316) 476-6857 903 Pine St, Cityville, OH 11135 6 (715) 986-8517 213 Main St, Sunset, TX 51893 7 (871) 921-6214 504 Cedar St, Cityville, NY 26501 8 (541) 759-6976 901 Birch St, Greenfield, TX 26345 9 (186) 926-6216 250 Pine St, Greenfield, IL 94761 Index(['Transaction ID', 'Transaction Date', 'Customer ID', 'Product ID', 'Quantity Sold', 'Unit Price', 'Total Sale Amount', 'Customer Name', 'Customer Email', 'Customer Phone', 'Customer Address'], dtype='str') Transaction ID Transaction Date Customer ID Product ID Quantity Sold \ 0 TXN0001 15-07-2024 CUST0040 PID001 10 1 TXN0002 26-10-2024 CUST0033 PID005 10 3 TXN0004 22-12-2024 CUST0010 PID007 6 4 TXN0005 10-04-2024 CUST0044 PID003 3 5 TXN0006 26-11-2024 CUST0043 PID005 1 6 TXN0007 22-04-2024 CUST0025 PID005 1 7 TXN0008 06-01-2024 CUST0019 PID002 5 8 TXN0009 04-04-2024 CUST0026 PID002 9 9 TXN0010 04-07-2024 CUST0024 PID005 4 10 TXN0011 30-08-2024 CUST0016 PID006 7 Unit Price Total Sale Amount Customer Name \ 0 200 2000.0 Alice Miller 1 350 3500.0 John Smith 3 100 600.0 Alice Johnson 4 400 1200.0 Daniel Rodriguez 5 200 200.0 Bob Jones 6 500 500.0 John Miller 7 200 1000.0 Bob Martinez 8 350 3150.0 Jane Smith 9 400 1600.0 Charlie Rodriguez 10 150 1050.0 Sophia Smith 2/28/26, 5:32 PM Prac2 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac2.html 2/6 Customer Email Customer Phone \ 0 Unknown (722) 885-6293 1 john.smith@outlook.com (733) 299-1696 3 alice.johnson@hotmail.com (268) 857-7514 4 daniel.rodriguez@outlook.com (874) 320-7771 5 bob.jones@outlook.com (316) 476-6857 6 Unknown (715) 986-8517 7 bob.martinez@hotmail.com (871) 921-6214 8 jane.smith@outlook.com (541) 759-6976 9 charlie.rodriguez@hotmail.com (186) 926-6216 10 sophia.smith@hotmail.com (289) 451-6068 Customer Address 0 624 Oak St, Riverside, FL 77884 1 111 Maple St, Hilltop, OH 97521 3 728 Elm St, Townsville, FL 54183 4 351 Birch St, Riverside, CA 91247 5 903 Pine St, Cityville, OH 11135 6 213 Main St, Sunset, TX 51893 7 504 Cedar St, Cityville, NY 26501 8 901 Birch St, Greenfield, TX 26345 9 250 Pine St, Greenfield, IL 94761 10 491 Main St, Greenfield, IL 95415 Transaction ID str Transaction Date str Customer ID str Product ID str Quantity Sold int64 Unit Price int64 Total Sale Amount float64 Customer Name str Customer Email str Customer Phone str Customer Address str dtype: object Transaction ID str Transaction Date datetime64[us] Customer ID str Product ID str Quantity Sold int64 Unit Price int64 Total Sale Amount float64 Customer Name str Customer Email str Customer Phone str Customer Address str dtype: object Total sales by customer Total Sale Amount Quantity Sold Customer ID Customer Name CUST0001 Daniel Brown 450.0 3 CUST0002 Grace Garcia 800.0 4 CUST0003 Charlie Davis 800.0 4 CUST0004 Alice Brown 1500.0 3 CUST0005 Emily Johnson 600.0 4 CUST0006 David Brown 1300.0 8 CUST0007 Bob Davis 1200.0 12 CUST0008 Daniel Miller 300.0 3 CUST0009 Daniel Martinez 600.0 2 CUST0010 Alice Johnson 5100.0 15 CUST0011 John Davis 300.0 1 CUST0012 Bob Johnson 2400.0 13 2/28/26, 5:32 PM Prac2 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac2.html 3/6 CUST0013 Jane Brown 6900.0 26 CUST0014 Daniel Jones 2450.0 15 CUST0015 Bob Rodriguez 1150.0 9 CUST0016 Sophia Smith 5050.0 15 CUST0017 Sophia Williams 1450.0 8 CUST0019 Bob Martinez 7000.0 22 CUST0020 John Brown 250.0 5 CUST0021 Emily Brown 4000.0 10 CUST0022 Sophia Davis 1050.0 21 CUST0023 Bob Miller 3000.0 8 CUST0024 Charlie Rodriguez 4350.0 13 CUST0025 John Miller 3050.0 14 CUST0026 Jane Smith 8500.0 25 CUST0027 Jane Smith 2000.0 5 CUST0028 John Brown 5450.0 13 CUST0030 Emily Brown 16400.0 45 CUST0031 Jane Martinez 2750.0 15 CUST0033 John Smith 4700.0 18 CUST0034 Emily Brown 1600.0 4 CUST0035 Emily Garcia 3200.0 17 CUST0036 Sophia Smith 2700.0 9 CUST0037 Alice Williams 4450.0 12 CUST0039 Sophia Jones 3300.0 15 CUST0040 Alice Miller 2800.0 14 CUST0041 Alice Smith 800.0 2 CUST0042 Daniel Garcia 3150.0 9 CUST0043 Bob Jones 12250.0 34 CUST0044 Daniel Rodriguez 5350.0 19 CUST0045 Grace Martinez 1800.0 10 CUST0046 Alice Williams 1150.0 7 CUST0049 John Martinez 1800.0 6 CUST0050 Daniel Garcia 500.0 2 Total sales by Product Total Sale Amount Quantity Sold Product ID PID001 11550.0 50 PID002 21650.0 64 PID003 14400.0 50 PID004 21450.0 81 PID005 17400.0 79 PID006 14800.0 52 PID007 17100.0 66 PID008 21350.0 77 Total Sale Amount Transaction Month 2024-01 11550.0 2024-02 24150.0 2024-03 8400.0 2024-04 10900.0 2024-05 11050.0 2024-06 8550.0 2024-07 6850.0 2024-08 8800.0 2024-09 3450.0 2024-10 9300.0 2024-11 19400.0 2024-12 17300.0 Customer ID CUST0001 1 CUST0002 1 CUST0003 1 CUST0004 1 2/28/26, 5:32 PM Prac2 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac2.html 4/6 CUST0005 1 CUST0006 2 CUST0007 2 CUST0008 1 CUST0009 2 CUST0010 2 CUST0011 1 CUST0012 2 CUST0013 3 CUST0014 3 CUST0015 2 CUST0016 2 CUST0017 4 CUST0019 3 CUST0020 1 CUST0021 1 CUST0022 3 CUST0023 2 CUST0024 3 CUST0025 3 CUST0026 5 CUST0027 1 CUST0028 2 CUST0030 6 CUST0031 2 CUST0033 2 CUST0034 1 CUST0035 4 CUST0036 1 CUST0037 2 CUST0039 2 CUST0040 2 CUST0041 1 CUST0042 2 CUST0043 7 CUST0044 4 CUST0045 2 CUST0046 2 CUST0049 1 CUST0050 1 Name: Transaction ID, dtype: int64 Max Sale Amount: $4500.00 Min Sale Amount: $100.00 Q 2) import pandas as pd employee_data = { 'employee_id': [101, 102, 103, 104], 'name': ['Alice', 'Bob', 'Charlie', 'David'], 'department_id': [1, 2, 1, 3], 'position': ['Developer', 'Manager', 'Developer', 'HR Specialist'], 'hire_date': ['2020-01-25', '2023-08-23', '2019-03-12', '2021-07-01'], 'salary': [80000, 95000, 85000, 70000] } department_data = { 'department_id': [1, 2, 3], 'department_name': ['IT', 'Marketing', 'HR'], 'manager_id': [105, 106, 107], 'location': ['New York', 'Chicago', 'San Francisco'] } In [7]: 2/28/26, 5:32 PM Prac2 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac2.html 5/6   df_employee = pd DataFrame(employee_data) df_department = pd DataFrame(department_data) df_merged = pd merge(df_employee, df_department, on = "department_id", how = "left") df_merged df_cleaned = df_merged dropna() df_merged['hire_date'] = pd to_datetime(df_merged['hire_date']) df_merged['years_of_service'] = (pd to_datetime('today') - df_merged['hire_date']) dt days // 365 df_merged total_salary = df_merged groupby('department_name')['salary'] sum() reset_index() total_salary rename(columns = {'salary': 'total_salary'}, inplace =True ) total_salary max_min_salary = df_merged groupby('department_name')['salary'] agg(['max', 'min']) reset_index() max_min_salary avg_years_service = df_merged groupby('department_name')['years_of_service'] mean() reset_index() avg_years_service rename(columns = {'years_of_service': 'avg_years_of_service'}, inplace =True ) avg_years_service position_dist = df_merged groupby(['department_name', 'position']) size() reset_index(name = "employees_count") position_dist median_salary = df_merged groupby('department_name')['salary'] median() reset_index() median_salary rename(columns = {'salary': 'median_salary'}, inplace =True ) median_salary df_merged['hired_last_year'] = df_merged['hire_date'] dt year == (pd to_datetime('today') year - 1) hired_last_year_count = df_merged groupby('department_name')['hired_last_year'] sum() reset_index() hired_last_year_count rename(columns = {'hired_last_year': 'employees_hired_last_year'}, inplace =True ) hired_last_year_count df_summary = pd merge(total_salary, max_min_salary, on = "department_name", how = 'left') df_summary = pd merge(df_summary, avg_years_service, on = "department_name", how = "left") df_summary = pd merge(df_summary, median_salary, on = "department_name", how = "left") df_summary = pd merge(df_summary, hired_last_year_count, on = "department_name", how = "left") df_summary department_name total_salary max min avg_years_of_service median_salary employees_hired_last_ye 0 HR 70000 70000 70000 4.0 70000.0 1 IT 165000 85000 80000 6.0 82500.0 2 Marketing 95000 95000 95000 2.0 95000.0 Out[7]: 2/28/26, 5:32 PM Prac2 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac2.html 6/6 Q 1) import numpy as np import pandas as pd from sklearn.datasets import load_iris from sklearn.feature_selection import VarianceThreshold import seaborn as sns import matplotlib.pyplot as plt iris = load_iris() X = pd DataFrame(iris data, columns = iris feature_names) y = iris target print(iris) variance_threshold = VarianceThreshold(threshold = 0.2) X_variance_reduced = variance_threshold fit_transform(X) X_variance_reduced_df = pd DataFrame(X_variance_reduced, columns = X columns[variance_threshold get_support()]) print("Data after variance Thresholding:") print(X_variance_reduced_df head()) correlation_matrix = X corr() plt figure(figsize = (8,6)) sns heatmap(correlation_matrix, annot =True , cmap = 'coolwarm', fmt = '.2f',\ cbar =True ) plt title('Correlation Matrix of Iris Features') plt show() upper_triangle = correlation_matrix where(np triu(np ones\ (correlation_matrix shape),k = 1) astype(bool)) highly_correlated_features = [] for i in range(upper_triangle shape[0]): for j in range(i + 1, upper_triangle shape[1]): if upper_triangle iloc[i,j] > 0.9: highly_correlated_features append((X columns[i],X columns[j])) print("\nHighly Correlated Features (Correlation > 0.9):") print(highly_correlated_features) features_to_remove = [feature[1] for feature in highly_correlated_features] X_correlated_reduced = X drop(columns = features_to_remove) print("\nData after Removing Highly Correlated Features:") print(X_correlated_reduced head()) In [5]: 2/28/26, 5:35 PM Prac3 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac3.html 1/8 {'data': array([[5.1, 3.5, 1.4, 0.2], [4.9, 3. , 1.4, 0.2], [4.7, 3.2, 1.3, 0.2], [4.6, 3.1, 1.5, 0.2], [5. , 3.6, 1.4, 0.2], [5.4, 3.9, 1.7, 0.4], [4.6, 3.4, 1.4, 0.3], [5. , 3.4, 1.5, 0.2], [4.4, 2.9, 1.4, 0.2], [4.9, 3.1, 1.5, 0.1], [5.4, 3.7, 1.5, 0.2], [4.8, 3.4, 1.6, 0.2], [4.8, 3. , 1.4, 0.1], [4.3, 3. , 1.1, 0.1], [5.8, 4. , 1.2, 0.2], [5.7, 4.4, 1.5, 0.4], [5.4, 3.9, 1.3, 0.4], [5.1, 3.5, 1.4, 0.3], [5.7, 3.8, 1.7, 0.3], [5.1, 3.8, 1.5, 0.3], [5.4, 3.4, 1.7, 0.2], [5.1, 3.7, 1.5, 0.4], [4.6, 3.6, 1. , 0.2], [5.1, 3.3, 1.7, 0.5], [4.8, 3.4, 1.9, 0.2], [5. , 3. , 1.6, 0.2], [5. , 3.4, 1.6, 0.4], [5.2, 3.5, 1.5, 0.2], [5.2, 3.4, 1.4, 0.2], [4.7, 3.2, 1.6, 0.2], [4.8, 3.1, 1.6, 0.2], [5.4, 3.4, 1.5, 0.4], [5.2, 4.1, 1.5, 0.1], [5.5, 4.2, 1.4, 0.2], [4.9, 3.1, 1.5, 0.2], [5. , 3.2, 1.2, 0.2], [5.5, 3.5, 1.3, 0.2], [4.9, 3.6, 1.4, 0.1], [4.4, 3. , 1.3, 0.2], [5.1, 3.4, 1.5, 0.2], [5. , 3.5, 1.3, 0.3], [4.5, 2.3, 1.3, 0.3], [4.4, 3.2, 1.3, 0.2], [5. , 3.5, 1.6, 0.6], [5.1, 3.8, 1.9, 0.4], [4.8, 3. , 1.4, 0.3], [5.1, 3.8, 1.6, 0.2], [4.6, 3.2, 1.4, 0.2], [5.3, 3.7, 1.5, 0.2], [5. , 3.3, 1.4, 0.2], [7. , 3.2, 4.7, 1.4], [6.4, 3.2, 4.5, 1.5], [6.9, 3.1, 4.9, 1.5], [5.5, 2.3, 4. , 1.3], [6.5, 2.8, 4.6, 1.5], [5.7, 2.8, 4.5, 1.3], [6.3, 3.3, 4.7, 1.6], [4.9, 2.4, 3.3, 1. ], [6.6, 2.9, 4.6, 1.3], [5.2, 2.7, 3.9, 1.4], [5. , 2. , 3.5, 1. ], [5.9, 3. , 4.2, 1.5], [6. , 2.2, 4. , 1. ], 2/28/26, 5:35 PM Prac3 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac3.html 2/8 [6.1, 2.9, 4.7, 1.4], [5.6, 2.9, 3.6, 1.3], [6.7, 3.1, 4.4, 1.4], [5.6, 3. , 4.5, 1.5], [5.8, 2.7, 4.1, 1. ], [6.2, 2.2, 4.5, 1.5], [5.6, 2.5, 3.9, 1.1], [5.9, 3.2, 4.8, 1.8], [6.1, 2.8, 4. , 1.3], [6.3, 2.5, 4.9, 1.5], [6.1, 2.8, 4.7, 1.2], [6.4, 2.9, 4.3, 1.3], [6.6, 3. , 4.4, 1.4], [6.8, 2.8, 4.8, 1.4], [6.7, 3. , 5. , 1.7], [6. , 2.9, 4.5, 1.5], [5.7, 2.6, 3.5, 1. ], [5.5, 2.4, 3.8, 1.1], [5.5, 2.4, 3.7, 1. ], [5.8, 2.7, 3.9, 1.2], [6. , 2.7, 5.1, 1.6], [5.4, 3. , 4.5, 1.5], [6. , 3.4, 4.5, 1.6], [6.7, 3.1, 4.7, 1.5], [6.3, 2.3, 4.4, 1.3], [5.6, 3. , 4.1, 1.3], [5.5, 2.5, 4. , 1.3], [5.5, 2.6, 4.4, 1.2], [6.1, 3. , 4.6, 1.4], [5.8, 2.6, 4. , 1.2], [5. , 2.3, 3.3, 1. ], [5.6, 2.7, 4.2, 1.3], [5.7, 3. , 4.2, 1.2], [5.7, 2.9, 4.2, 1.3], [6.2, 2.9, 4.3, 1.3], [5.1, 2.5, 3. , 1.1], [5.7, 2.8, 4.1, 1.3], [6.3, 3.3, 6. , 2.5], [5.8, 2.7, 5.1, 1.9], [7.1, 3. , 5.9, 2.1], [6.3, 2.9, 5.6, 1.8], [6.5, 3. , 5.8, 2.2], [7.6, 3. , 6.6, 2.1], [4.9, 2.5, 4.5, 1.7], [7.3, 2.9, 6.3, 1.8], [6.7, 2.5, 5.8, 1.8], [7.2, 3.6, 6.1, 2.5], [6.5, 3.2, 5.1, 2. ], [6.4, 2.7, 5.3, 1.9], [6.8, 3. , 5.5, 2.1], [5.7, 2.5, 5. , 2. ], [5.8, 2.8, 5.1, 2.4], [6.4, 3.2, 5.3, 2.3], [6.5, 3. , 5.5, 1.8], [7.7, 3.8, 6.7, 2.2], [7.7, 2.6, 6.9, 2.3], [6. , 2.2, 5. , 1.5], [6.9, 3.2, 5.7, 2.3], [5.6, 2.8, 4.9, 2. ], [7.7, 2.8, 6.7, 2. ], [6.3, 2.7, 4.9, 1.8], [6.7, 3.3, 5.7, 2.1], [7.2, 3.2, 6. , 1.8], 2/28/26, 5:35 PM Prac3 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac3.html 3/8 [6.2, 2.8, 4.8, 1.8], [6.1, 3. , 4.9, 1.8], [6.4, 2.8, 5.6, 2.1], [7.2, 3. , 5.8, 1.6], [7.4, 2.8, 6.1, 1.9], [7.9, 3.8, 6.4, 2. ], [6.4, 2.8, 5.6, 2.2], [6.3, 2.8, 5.1, 1.5], [6.1, 2.6, 5.6, 1.4], [7.7, 3. , 6.1, 2.3], [6.3, 3.4, 5.6, 2.4], [6.4, 3.1, 5.5, 1.8], [6. , 3. , 4.8, 1.8], [6.9, 3.1, 5.4, 2.1], [6.7, 3.1, 5.6, 2.4], [6.9, 3.1, 5.1, 2.3], [5.8, 2.7, 5.1, 1.9], [6.8, 3.2, 5.9, 2.3], [6.7, 3.3, 5.7, 2.5], [6.7, 3. , 5.2, 2.3], [6.3, 2.5, 5. , 1.9], [6.5, 3. , 5.2, 2. ], [6.2, 3.4, 5.4, 2.3], [5.9, 3. , 5.1, 1.8]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), 'frame': None, 'target_names': array (['setosa', 'versicolor', 'virginica'], dtype='<U10'), 'DESCR': '.. _iris_dataset:\n\nIris plants data set\n--------------------\n\n**Data Set Characteristics:**\n\n:Number of Instances: 150 (50 in each of three classes)\n:Number of Attributes: 4 numeric, predictive attributes and the class\n:Attribute Info rmation:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal wid th in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris- Virginica\n\n:Summary Statistics:\n\n============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n============== ==== ==== ======= ===== ==================== \nsepal length: 4.3 7.9 5.84 0.83 0.7826\nsepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\npetal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n============== ==== ==== ======= ===== ====================\n\n:Missing Attribute Val ues: None\n:Class Distribution: 33.3% for each of 3 classes.\n:Creator: R.A. Fisher\n:Donor: Michael M arshall (MARSHALL%PLU@io.arc.nasa.gov)\n:Date: July, 1988\n\nThe famous Iris database, first used by S ir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the be st known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set c ontains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class i s linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. d ropdown:: References\n\n - Fisher, R.A. "The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysi s.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition i n Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intell igence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-6 4. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...\n', 'feature_names': ['sepal length (cm)', 'sepal width (cm)', 'petal length (c m)', 'petal width (cm)'], 'filename': 'iris.csv', 'data_module': 'sklearn.datasets.data'} Data after variance Thresholding: sepal length (cm) petal length (cm) petal width (cm) 0 5.1 1.4 0.2 2/28/26, 5:35 PM Prac3 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac3.html 4/8 1 4.9 1.4 0.2 2 4.7 1.3 0.2 3 4.6 1.5 0.2 4 5.0 1.4 0.2 Highly Correlated Features (Correlation > 0.9): [('petal length (cm)', 'petal width (cm)')] Data after Removing Highly Correlated Features: sepal length (cm) sepal width (cm) petal length (cm) 0 5.1 3.5 1.4 1 4.9 3.0 1.4 2 4.7 3.2 1.3 3 4.6 3.1 1.5 4 5.0 3.6 1.4 Q 2) import numpy as np import pandas as pd from sklearn.datasets import load_breast_cancer from sklearn.feature_selection import VarianceThreshold import seaborn as sns import matplotlib.pyplot as plt data = load_breast_cancer() X = pd DataFrame(data data, columns = data feature_names) y = data target variance_threshold = VarianceThreshold(threshold = 0.2) X_variance_reduced = variance_threshold fit_transform(X) In [6]: 2/28/26, 5:35 PM Prac3 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac3.html 5/8 X_variance_reduced_df = pd DataFrame(X_variance_reduced, columns = X columns[variance_threshold get_support()]) print("Data after variance Thresholding:\n", X_variance_reduced_df head()) correlation_matrix = X corr() plt figure(figsize = (10,8)) sns heatmap(correlation_matrix, annot =True , cmap = 'coolwarm', fmt = '.2f', cbar =True ) plt title('Correlation Matrix of Breast Cancer Features') plt show() upper_triangle = correlation_matrix where(np triu(np ones(correlation_matrix shape),k = 1) astype(bool)) highly_correlated_features = [] for i in range(upper_triangle shape[0]): for j in range(i + 1, upper_triangle shape[1]): if upper_triangle iloc[i,j] > 0.9: highly_correlated_features append((X columns[i],X columns[j])) print("\nHighly Correlated Features (Correlation > 0.9):\n",highly_correlated_features) features_to_remove = [feature[1] for feature in highly_correlated_features] X_correlated_reduced = X drop(columns = features_to_remove) print("\nData after Removing Highly Correlated Features:\n",X_correlated_reduced head()) Data after variance Thresholding: mean radius mean texture mean perimeter mean area texture error \ 0 17.99 10.38 122.80 1001.0 0.9053 1 20.57 17.77 132.90 1326.0 0.7339 2 19.69 21.25 130.00 1203.0 0.7869 3 11.42 20.38 77.58 386.1 1.1560 4 20.29 14.34 135.10 1297.0 0.7813 perimeter error area error worst radius worst texture worst perimeter \ 0 8.589 153.40 25.38 17.33 184.60 1 3.398 74.08 24.99 23.41 158.80 2 4.585 94.03 23.57 25.53 152.50 3 3.445 27.23 14.91 26.50 98.87 4 5.438 94.44 22.54 16.67 152.20 worst area 0 2019.0 1 1956.0 2 1709.0 3 567.7 4 1575.0 2/28/26, 5:35 PM Prac3 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac3.html 6/8 2/28/26, 5:35 PM Prac3 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac3.html 7/8 Highly Correlated Features (Correlation > 0.9): [('mean radius', 'mean perimeter'), ('mean radius', 'mean area'), ('mean radius', 'worst radius'), ('mean radius', 'worst perimeter'), ('mean radius', 'worst area'), ('mean texture', 'worst texture'), ('mean perimeter', 'mean area'), ('mean perimeter', 'worst radius'), ('mean perimeter', 'worst perimet er'), ('mean perimeter', 'worst area'), ('mean area', 'worst radius'), ('mean area', 'worst perimete r'), ('mean area', 'worst area'), ('mean concavity', 'mean concave points'), ('mean concave points', 'worst concave points'), ('radius error', 'perimeter error'), ('radius error', 'area error'), ('perime ter error', 'area error'), ('worst radius', 'worst perimeter'), ('worst radius', 'worst area'), ('wors t perimeter', 'worst area')] Data after Removing Highly Correlated Features: mean radius mean texture mean smoothness mean compactness \ 0 17.99 10.38 0.11840 0.27760 1 20.57 17.77 0.08474 0.07864 2 19.69 21.25 0.10960 0.15990 3 11.42 20.38 0.14250 0.28390 4 20.29 14.34 0.10030 0.13280 mean concavity mean symmetry mean fractal dimension radius error \ 0 0.3001 0.2419 0.07871 1.0950 1 0.0869 0.1812 0.05667 0.5435 2 0.1974 0.2069 0.05999 0.7456 3 0.2414 0.2597 0.09744 0.4956 4 0.1980 0.1809 0.05883 0.7572 texture error smoothness error compactness error concavity error \ 0 0.9053 0.006399 0.04904 0.05373 1 0.7339 0.005225 0.01308 0.01860 2 0.7869 0.006150 0.04006 0.03832 3 1.1560 0.009110 0.07458 0.05661 4 0.7813 0.011490 0.02461 0.05688 concave points error symmetry error fractal dimension error \ 0 0.01587 0.03003 0.006193 1 0.01340 0.01389 0.003532 2 0.02058 0.02250 0.004571 3 0.01867 0.05963 0.009208 4 0.01885 0.01756 0.005115 worst smoothness worst compactness worst concavity worst symmetry \ 0 0.1622 0.6656 0.7119 0.4601 1 0.1238 0.1866 0.2416 0.2750 2 0.1444 0.4245 0.4504 0.3613 3 0.2098 0.8663 0.6869 0.6638 4 0.1374 0.2050 0.4000 0.2364 worst fractal dimension 0 0.11890 1 0.08902 2 0.08758 3 0.17300 4 0.07678 2/28/26, 5:35 PM Prac3 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac3.html 8/8 Q 1) import pandas as pd import numpy as np from sklearn.metrics.pairwise import cosine_similarity from scipy.sparse import csr_matrix ratings = pd read_csv('D:\TYCS\\sem6\\PRACTICALS\\Codes\\DMW\\DMW_Prac\\P4\\ratings.csv') movies = pd read_csv('D:\TYCS\\sem6\\PRACTICALS\\Codes\\DMW\\DMW_Prac\\P4\\movies.csv') print("Ratings Dataset:") print(ratings head()) print("\nMovies Dataset:") print(movies head()) user_item_matrix = ratings pivot_table(index = 'userId',columns = 'movieId',values = 'rating') user_item_matrix = user_item_matrix fillna(0) print(user_item_matrix head()) sparse_user_item = csr_matrix(user_item_matrix values) user_similarity = cosine_similarity(sparse_user_item) user_similarity_df = pd DataFrame(user_similarity,index = user_item_matrix index, columns = user_item_matrix index) print("\nUser similarity matrix:") print(user_similarity_df head()) def recommend_movies(user_id,top_n = 10): similarity_scores = user_similarity_df[user_id] similarity_scores = similarity_scores sort_values(ascending =False ) similar_users = similarity_scores index[1:] similar_users_rating = user_item_matrix iloc[similar_users] weighted_ratings = similar_users_ratings T dot(similarity_scores[1:]) predicted_ratings = weighted_ratings / similarity_scores[1:] sum() user_rated_movies = user_item_matrix iloc[user_id] predicted_ratings = predicted_ratings[user_rated_movies == 0] top_recommendations = predicted_ratings sort_values(ascending =False ) head(top_n) recommended_movies = movies[movies['movieid'] isin(top_recommendations index)] return recommended_movies In [3]: 2/28/26, 5:39 PM Prac4 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac4.html 1/3 Ratings Dataset: userId movieId rating timestamp 0 1 1 4.0 964982703.0 1 1 3 4.0 964981247.0 2 1 6 4.0 964982224.0 3 1 47 5.0 964983815.0 4 1 50 5.0 964982931.0 Movies Dataset: id title \ 0 1 Toy Story (1995) 1 2 Jumanji (1995) 2 3 Grumpier Old Men (1995) 3 4 Waiting to Exhale (1995) 4 5 Father of the Bride Part II (1995) tags 0 Adventure|Animation|Children|Comedy|Fantasy 1 Adventure|Children|Fantasy 2 Comedy|Romance 3 Comedy|Drama|Romance 4 Comedy movieId 1 2 3 4 5 6 7 8 \ userId 1 4.0 0.0 4.0 0.0 0.0 4.0 0.0 0.0 2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 movieId 10 11 ... 184349 184471 184791 185031 185135 187541 \ userId ... 1 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 2 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 4 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 5 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 movieId 187593 187595 188301 190183 userId 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 0.0 5 0.0 0.0 0.0 0.0 [5 rows x 4737 columns] User similarity matrix: userId 1 2 3 4 5 6 7 \ userId 1 1.000000 0.027283 0.059720 0.194395 0.129080 0.128152 0.158744 2 0.027283 1.000000 0.000000 0.003726 0.016614 0.025333 0.027585 3 0.059720 0.000000 1.000000 0.002251 0.005020 0.003936 0.000000 4 0.194395 0.003726 0.002251 1.000000 0.128659 0.088491 0.115120 5 0.129080 0.016614 0.005020 0.128659 1.000000 0.300349 0.108342 userId 8 9 10 ... 113 114 115 \ userId ... 1 0.136968 0.064263 0.016875 ... 0.072532 0.078342 0.225257 2 0.027257 0.000000 0.067445 ... 0.000000 0.087650 0.022085 3 0.004941 0.000000 0.000000 ... 0.004259 0.000000 0.018016 4 0.062969 0.011361 0.031163 ... 0.153986 0.060063 0.159077 2/28/26, 5:39 PM Prac4 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac4.html 2/3 5 0.429075 0.000000 0.030611 ... 0.037392 0.000000 0.187884 userId 116 117 118 119 120 121 122 userId 1 0.185708 0.165122 0.132333 0.057203 0.111491 0.117801 0.196493 2 0.046360 0.028542 0.000000 0.176479 0.000000 0.026743 0.147195 3 0.004109 0.023570 0.005473 0.000000 0.000000 0.002909 0.013450 4 0.082364 0.084375 0.062689 0.022371 0.088782 0.033320 0.122858 5 0.110899 0.445072 0.170346 0.040487 0.036210 0.366812 0.060527 [5 rows x 122 columns] 2/28/26, 5:39 PM Prac4 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac4.html 3/3 import pandas as pd from mlxtend.frequent_patterns import apriori from mlxtend.frequent_patterns import association_rules #import filedisplay as fp store_data = pd read_csv("C:\\Users\\bhara\\OneDrive\\Desktop\\lalit\\Sem6\\DMWPrac\\Dmw\\P5\\store_data.csv", header =None ) #preprocess dataset store_data = store_data fillna(0) #create list transcatios records = [] for i in range(0,store_data shape[0]): records append([str(store_data values[i,j]) for j in range(store_data shape[1]) if str (store_data values[i,j]) != '0']) #convert into dataframes itemsets = pd DataFrame([ {item: 1 for item in transaction} for transaction in records]) fillna(0) astype(int) print(itemsets) #generate frequetnt itemset frequent_itemsets = apriori(itemsets,min_support = 0.0045,use_colnames =True ) print("Frequent itemsets: ") print(frequent_itemsets) #calculate total number of itemsets num_itemsets = len(frequent_itemsets) print(num_itemsets) #generate ar rules = association_rules(frequent_itemsets,metric = "confidence", min_threshold = 0.2,num_itemsets = num_itemsets) print(rules) #filter rules min_lift = 3 filtered_rules = rules[rules['lift'] >= min_lift] print(filtered_rules) #display frequent itemsets #display ar print("\nAssociation Rules: ") for _, rule in filtered_rules iterrows(): print(f"Rule: {rule['antecedents']}->{rule['consequents']}") print(f"Support:{rule['support']}") print(f"Confidence: {rule['confidence']}") print(f"Lift: {rule['lift']}") print("+++++++++++++++++++++") In [1]: 2/28/26, 7:29 PM Prac5 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac5.html 1/7 shrimp almonds avocado vegetables mix green grapes \ 0 1 1 1 1 1 1 0 0 0 0 0 2 0 0 0 0 0 3 0 0 1 0 0 4 0 0 0 0 0 ... ... ... ... ... ... 7496 0 0 0 0 0 7497 0 0 0 0 0 7498 0 0 0 0 0 7499 0 0 0 0 0 7500 0 0 0 0 0 whole weat flour yams cottage cheese energy drink tomato juice ... \ 0 1 1 1 1 1 ... 1 0 0 0 0 0 ... 2 0 0 0 0 0 ... 3 0 0 0 0 0 ... 4 0 0 0 0 0 ... ... ... ... ... ... ... ... 7496 0 0 0 0 0 ... 7497 0 0 0 0 0 ... 7498 0 0 0 0 0 ... 7499 0 0 0 0 0 ... 7500 0 0 0 0 0 ... melons cauliflower green beans ketchup bramble burger sauce \ 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 0 0 0 0 0 0 3 0 0 0 0 0 0 4 0 0 0 0 0 0 ... ... ... ... ... ... ... 7496 0 0 0 0 0 0 7497 0 0 0 0 0 0 7498 0 0 0 0 0 0 7499 0 0 0 0 0 0 7500 0 0 0 0 0 0 oatmeal asparagus cream napkins 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 3 0 0 0 0 4 0 0 0 0 ... ... ... ... ... 7496 0 0 0 0 7497 0 0 0 0 7498 0 0 0 0 7499 0 0 0 0 7500 0 0 0 0 [7501 rows x 120 columns] C:\Users\bhara\anaconda3\Lib\site-packages\mlxtend\frequent_patterns\fpcommon.py:175: DeprecationWarning: Data Frames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type warnings.warn( 2/28/26, 7:29 PM Prac5 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac5.html 2/7 Frequent itemsets: support itemsets 0 0.071457 (shrimp) 1 0.020397 (almonds) 2 0.033329 (avocado) 3 0.025730 (vegetables mix) 4 0.009065 (green grapes) .. ... ... 837 0.005333 (ground beef, spaghetti, grated cheese) 838 0.006399 (ground beef, spaghetti, herb & pepper) 839 0.004533 (mineral water, spaghetti, chocolate, eggs) 840 0.004533 (mineral water, spaghetti, milk, frozen vegeta... 841 0.004933 (mineral water, spaghetti, milk, chocolate) [842 rows x 2 columns] 842 antecedents consequents \ 0 (shrimp) (mineral water) 1 (shrimp) (milk) 2 (shrimp) (frozen vegetables) 3 (shrimp) (spaghetti) 4 (shrimp) (chocolate) .. ... ... 695 (spaghetti, milk, frozen vegetables) (mineral water) 696 (mineral water, spaghetti, milk) (chocolate) 697 (mineral water, spaghetti, chocolate) (milk) 698 (mineral water, milk, chocolate) (spaghetti) 699 (spaghetti, milk, chocolate) (mineral water) antecedent support consequent support support confidence lift \ 0 0.071457 0.238368 0.023597 0.330224 1.385352 1 0.071457 0.129583 0.017598 0.246269 1.900474 2 0.071457 0.095321 0.016664 0.233209 2.446574 3 0.071457 0.174110 0.021197 0.296642 1.703760 4 0.071457 0.163845 0.017998 0.251866 1.537221 .. ... ... ... ... ... 695 0.008266 0.238368 0.004533 0.548387 2.300588 696 0.015731 0.163845 0.004933 0.313559 1.913758 697 0.015865 0.129583 0.004933 0.310924 2.399428 698 0.013998 0.174110 0.004933 0.352381 2.023897 699 0.010932 0.238368 0.004933 0.451220 1.892952 representativity leverage conviction zhangs_metric jaccard \ 0 1.0 0.006564 1.137144 0.299568 0.082441 1 1.0 0.008338 1.154811 0.510279 0.095930 2 1.0 0.009853 1.179825 0.636767 0.111012 3 1.0 0.008756 1.174209 0.444850 0.094474 4 1.0 0.006290 1.117654 0.376370 0.082822 .. ... ... ... ... ... 695 1.0 0.002562 1.686470 0.570040 0.018722 696 1.0 0.002355 1.218103 0.485099 0.028244 697 1.0 0.002877 1.263167 0.592636 0.035104 698 1.0 0.002495 1.275271 0.513086 0.026929 699 1.0 0.002327 1.387862 0.476938 0.020185 certainty kulczynski 0 0.120604 0.214609 1 0.134057 0.191036 2 0.152417 0.204017 3 0.148363 0.209194 4 0.105269 0.180856 .. ... ... 695 0.407046 0.283701 696 0.179051 0.171833 697 0.208339 0.174495 698 0.215853 0.190356 699 0.279467 0.235957 2/28/26, 7:29 PM Prac5 file:///D:/TYCS/sem6/PRACTICALS/Manuals/DMW/Prac5.html 3/7