2. Implementation of pre-processing techniques and evaluation with various statistical methods for any given raw data.

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Sample market-basket transactions
dataset = [
    ['milk', 'bread', 'butter'],
    ['bread', 'butter'],
    ['milk', 'bread'],
    ['milk', 'bread', 'butter', 'jam'],
    ['bread', 'jam']
]

# Step 1: One-hot encode the transactions and mine frequent itemsets with Apriori
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)
print("Frequent Itemsets:\n", frequent_itemsets)

# Step 2: Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print("\nAssociation Rules:\n", rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

3. Implementation of association rule mining - FP-Growth Algorithm.

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

dataset = [
    ['milk', 'bread', 'butter'],
    ['bread', 'butter'],
    ['milk', 'bread'],
    ['milk', 'bread', 'butter', 'jam'],
    ['bread', 'jam']
]

# One-hot encode the transactions
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Mine frequent itemsets with FP-Growth
frequent_itemsets = fpgrowth(df, min_support=0.4, use_colnames=True)
print("Frequent Itemsets:\n", frequent_itemsets)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print("\nAssociation Rules:\n", rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

4. Implementation of clustering algorithms - Partitioning Algorithm: K-means Algorithm.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=42)

class KMeans:
    def __init__(self, k=4, max_iters=100, tolerance=1e-4):
        self.k = k
        self.max_iters = max_iters
        self.tolerance = tolerance

    def fit(self, X):
        # Initialize centroids from k random distinct points
        np.random.seed(42)
        random_idxs = np.random.permutation(X.shape[0])[:self.k]
        self.centroids = X[random_idxs]
        for _ in range(self.max_iters):
            self.labels = self._assign_clusters(X)
            new_centroids = np.array([X[self.labels == i].mean(axis=0) for i in range(self.k)])
            # Check for convergence: every centroid moved less than the tolerance
            if np.all(np.linalg.norm(self.centroids - new_centroids, axis=1) < self.tolerance):
                break
            self.centroids = new_centroids

    def _assign_clusters(self, X):
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

    def predict(self, X):
        return self._assign_clusters(X)

# Run: fit the model, then predict cluster labels
kmeans = KMeans(k=4)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=30, cmap='viridis')
plt.scatter(kmeans.centroids[:, 0], kmeans.centroids[:, 1], c='red', s=200, alpha=0.75, marker='X')
plt.title('K-Means Clustering')
plt.show()
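As a quick cross-check for Experiment 4 (an addition to the listing above, not part of the manual), the from-scratch labels can be compared against scikit-learn's KMeans using the silhouette coefficient. The snippet assumes X and the fitted kmeans object from the listing above are still in scope.

# Sanity check (assumption: run directly after Experiment 4, so X and kmeans exist)
from sklearn.cluster import KMeans as SKKMeans
from sklearn.metrics import silhouette_score

sk_model = SKKMeans(n_clusters=4, n_init=10, random_state=42).fit(X)

# Silhouette ranges from -1 to 1; higher means tighter, better-separated clusters
print("Scratch silhouette:", silhouette_score(X, kmeans.predict(X)))
print("sklearn silhouette:", silhouette_score(X, sk_model.labels_))

With four well-separated blobs, both scores should be close, since the two implementations converge to essentially the same partition.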
5. Implementation of clustering algorithm - Hierarchical Clustering: BIRCH algorithm.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

X, y_true = make_blobs(n_samples=300, centers=3, cluster_std=0.6, random_state=42)

class KMeans:
    def __init__(self, k=3, max_iters=100, tol=1e-4):
        self.k = k
        self.max_iters = max_iters
        self.tol = tol

    def fit(self, X):
        np.random.seed(42)
        random_idx = np.random.permutation(X.shape[0])[:self.k]
        self.centroids = X[random_idx]
        for i in range(self.max_iters):
            self.labels = self._assign_clusters(X)
            new_centroids = np.array([X[self.labels == j].mean(axis=0) for j in range(self.k)])
            diff = np.linalg.norm(self.centroids - new_centroids)
            if diff < self.tol:
                break
            self.centroids = new_centroids

    def _assign_clusters(self, X):
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

    def predict(self, X):
        return self._assign_clusters(X)

# Run KMeans
model = KMeans(k=3)
model.fit(X)
y_pred = model.predict(X)

# Plotting
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=50, cmap='viridis')
plt.scatter(model.centroids[:, 0], model.centroids[:, 1], c='red', s=200, marker='X')
plt.title("K-Means Clustering (From Scratch)")
plt.show()

Note: the listing above is a from-scratch K-means rather than BIRCH; a minimal BIRCH sketch using scikit-learn is given after Experiment 6 below.

6. Implementation of clustering algorithms - CURE Algorithm.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from scipy.spatial.distance import cdist

X, _ = make_blobs(n_samples=100, centers=4, random_state=42)

class CureClustering:
    def __init__(self, k=4, c=5, alpha=0.2):
        self.k = k          # target number of clusters
        self.c = c          # representative points per cluster
        self.alpha = alpha  # shrink factor toward the centroid
        self.clusters = []

    def fit(self, X):
        # Start with every point as its own cluster, then merge greedily
        self.clusters = [[x] for x in X]
        while len(self.clusters) > self.k:
            distances = np.full((len(self.clusters), len(self.clusters)), np.inf)
            for i in range(len(self.clusters)):
                for j in range(i + 1, len(self.clusters)):
                    distances[i][j] = self._cluster_distance(self.clusters[i], self.clusters[j])
            # Merge the closest pair of clusters
            i, j = np.unravel_index(np.argmin(distances), distances.shape)
            new_cluster = self.clusters[i] + self.clusters[j]
            self.clusters.pop(j)  # pop the larger index first so i stays valid
            self.clusters.pop(i)
            self.clusters.append(new_cluster)

    def _cluster_distance(self, cluster1, cluster2):
        reps1 = self._get_representatives(cluster1)
        reps2 = self._get_representatives(cluster2)
        return np.min(cdist(reps1, reps2))

    def _get_representatives(self, cluster):
        cluster = np.array(cluster)
        centroid = np.mean(cluster, axis=0)
        dists = cdist(cluster, [centroid]).flatten()
        idx = np.argsort(dists)[-self.c:]  # the c points farthest from the centroid
        reps = cluster[idx]
        # Shrink the representatives toward the centroid by alpha
        return reps * (1 - self.alpha) + centroid * self.alpha

    def predict(self, X):
        labels = np.zeros(len(X))
        for i, x in enumerate(X):
            dists = [np.min(cdist([x], cluster)) for cluster in self.clusters]
            labels[i] = np.argmin(dists)
        return labels.astype(int)

cure = CureClustering(k=4, c=5, alpha=0.2)
cure.fit(X)
labels = cure.predict(X)

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
plt.title("CURE Clustering (Simplified)")
plt.show()
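Experiment 5's heading calls for BIRCH, while its listing implements K-means from scratch. For completeness, here is a minimal BIRCH sketch using scikit-learn's Birch estimator on the same kind of blob data; the threshold and branching_factor values are illustrative assumptions, not values prescribed by the manual.

# Minimal BIRCH sketch for Experiment 5 (parameter values are illustrative)
import matplotlib.pyplot as plt
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X_b, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.6, random_state=42)

# Birch builds a CF-tree summary in one pass: threshold bounds the radius
# of each subcluster, branching_factor bounds the node size of the tree
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=3)
labels_b = birch.fit_predict(X_b)

plt.scatter(X_b[:, 0], X_b[:, 1], c=labels_b, cmap='viridis', s=30)
plt.title("BIRCH Clustering")
plt.show()

Because BIRCH clusters the compact CF-tree summary rather than the raw points, it scales to datasets far larger than the ones used in these experiments.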
7. Implementation of clustering algorithms - Density-Based Clustering: DBSCAN Algorithm.

from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# DBSCAN on a non-convex dataset (two interleaving half-moons)
X, _ = make_moons(n_samples=300, noise=0.05, random_state=42)
db = DBSCAN(eps=0.2, min_samples=5)
labels = db.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='plasma', s=40)
plt.title("DBSCAN Clustering (Moons Dataset)")
plt.show()

# DBSCAN on convex Gaussian blobs
from sklearn.datasets import make_blobs
X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=42)
db = DBSCAN(eps=0.3, min_samples=5)
labels = db.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=40)
plt.title("DBSCAN on Gaussian Blobs")
plt.show()

# Evaluate with the silhouette score (needs at least two distinct labels)
from sklearn.metrics import silhouette_score
if len(set(labels)) > 1:
    score = silhouette_score(X, labels)
    print(f"Silhouette Score: {score:.2f}")
else:
    print("Silhouette Score not applicable (only one cluster found).")

8. Implementation of clustering algorithms - Agglomerative Algorithm.

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering

X, y = make_blobs(n_samples=100, centers=3, random_state=42)

# Build the linkage matrix and draw the dendrogram
Z = linkage(X, method='ward')
plt.figure(figsize=(10, 6))
dendrogram(Z)
plt.title("Hierarchical Clustering Dendrogram")
plt.show()

# Cluster with scikit-learn's AgglomerativeClustering
model = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
labels = model.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
plt.title("Agglomerative Clustering (3 clusters)")
plt.show()

# Alternatively, cut the dendrogram at a distance threshold
clusters = fcluster(Z, t=10, criterion='distance')
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis')
plt.title("Hierarchical Clustering (Cut at t=10)")
plt.show()

9. Implementation of data-stream algorithms - Bloom Filter.

import hashlib
import numpy as np

class BloomFilter:
    def __init__(self, size=1000, hash_count=3):
        self.size = size
        self.hash_count = hash_count
        self.bit_array = np.zeros(size, dtype=bool)

    def _hashes(self, item):
        # Derive hash_count indices by salting MD5 with the hash number
        result = []
        item = item.encode('utf-8')
        for i in range(self.hash_count):
            hash_result = hashlib.md5(item + str(i).encode()).hexdigest()
            result.append(int(hash_result, 16) % self.size)
        return result

    def add(self, item):
        for index in self._hashes(item):
            self.bit_array[index] = True

    def check(self, item):
        # May return a false positive, but never a false negative
        return all(self.bit_array[index] for index in self._hashes(item))

bf = BloomFilter(size=1000, hash_count=4)
bf.add("apple")
bf.add("banana")
bf.add("orange")

print("Check 'apple':", bf.check("apple"))
print("Check 'grape':", bf.check("grape"))
print("Check 'banana':", bf.check("banana"))

10. Implementation of link analysis algorithms - PageRank.

import numpy as np

def pagerank(link_matrix, damping=0.85, max_iter=100, tol=1e-6):
    N = link_matrix.shape[0]
    out_link_counts = link_matrix.sum(axis=1)

    # Row-stochastic transition matrix; dangling pages link uniformly to all pages
    transition_matrix = np.zeros_like(link_matrix, dtype=float)
    for i in range(N):
        if out_link_counts[i] == 0:
            transition_matrix[i] = 1.0 / N
        else:
            transition_matrix[i] = link_matrix[i] / out_link_counts[i]

    # Apply the damping factor (random-surfer teleportation)
    transition_matrix = damping * transition_matrix + (1 - damping) / N

    # Power iteration starting from the uniform distribution
    pr = np.ones(N) / N
    for _ in range(max_iter):
        new_pr = pr @ transition_matrix
        if np.linalg.norm(new_pr - pr, 1) < tol:
            pr = new_pr
            break
        pr = new_pr
    return pr

# Row i lists the pages that page i links to
link_matrix = np.array([
    [0, 1, 1, 0],
    [0, 0, 0, 1],
    [1, 0, 0, 1],
    [0, 0, 1, 0],
])

ranks = pagerank(link_matrix)
print("PageRanks:", ranks)
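To sanity-check the power-iteration result, the same graph can be fed to networkx's built-in PageRank. This cross-check assumes networkx is installed and is not part of the original experiment; both implementations use the same damping convention (alpha = damping factor), so the scores should agree to within the tolerance and sum to 1.

# Optional cross-check against networkx (assumption: networkx is installed)
import networkx as nx
import numpy as np

link_matrix = np.array([
    [0, 1, 1, 0],
    [0, 0, 0, 1],
    [1, 0, 0, 1],
    [0, 0, 1, 0],
])

# Build a directed graph with an edge i -> j wherever link_matrix[i][j] == 1
G = nx.from_numpy_array(link_matrix, create_using=nx.DiGraph)

nx_ranks = nx.pagerank(G, alpha=0.85, tol=1e-6)
print("networkx PageRanks:", [round(nx_ranks[i], 4) for i in range(len(nx_ranks))])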