R Code

Introduction

R is a popular programming language and environment for statistical computing and graphics, widely used in data analysis, machine learning, and data visualization. In this article, we walk through R code for a complete analysis workflow: loading, cleaning, and visualizing data, then training and evaluating models.

Section 1: Data Loading and Cleaning

Installing Packages

# Install required packages (corrplot and e1071 are used in later sections)
install.packages(c("tidyverse", "ggplot2", "cluster", "factoextra",
                   "caret", "randomForest", "DataExplorer", "iml",
                   "corrplot", "e1071"))

# Load required packages
library(tidyverse)
library(ggplot2)
library(cluster)
library(factoextra)
library(caret)
library(randomForest)
library(DataExplorer)
library(iml)

Loading Data

# Load the data (the file is assumed to include a header row, since
# later code refers to columns such as 'diagnosis' and 'radius_mean' by name)
data <- read.csv("breast cancer data.csv", header = TRUE)

# View the first few rows
head(data)

# Check the structure
str(data)
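
Since DataExplorer is already loaded, a quick automated profile can complement these manual checks before hand-cleaning begins. A minimal sketch:

# Quick automated profiling with DataExplorer (loaded above)
plot_intro(data)    # Share of discrete/continuous columns and missing values
plot_missing(data)  # Missing-value percentage per column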

Cleaning Data

# Convert to appropriate types
data <- data %>%
  mutate(across(where(is.character), ~ trimws(.))) %>%  # Trim whitespace
  mutate(across(c(-id, -diagnosis), as.numeric)) %>%    # Convert all but 'id' and 'diagnosis' to numeric
  mutate(
    id = as.factor(id),
    # Lowercase and expand the usual 'M'/'B' coding to the full labels
    # used by the plots in Section 2
    diagnosis = factor(recode(tolower(diagnosis),
                              "m" = "malignant", "b" = "benign"))
  ) %>%
  filter(if_all(where(is.numeric), ~ !is.na(.) & . >= 0))  # Drop rows with negative or NA numeric values
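
A quick sanity check confirms that the conversions and filtering worked; this sketch assumes the columns described above:

# Sanity-check the cleaned data
table(data$diagnosis)                            # Class balance
colSums(is.na(select(data, where(is.numeric))))  # Should be all zeros after filtering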

Section 2: Data Visualization

Plotting Data Structure

# Load required libraries
library(ggplot2)
library(dplyr)
library(corrplot)
library(grid)

# Function to plot with space and explanation
plot_with_explanation <- function(plot_obj, explanation_text) {
  print(plot_obj)
  grid.text(explanation_text,
            x = 0.5, y = unit(1, "npc") - unit(5, "lines"), just = "center",
            gp = gpar(fontsize = 11, col = "darkblue"))
  grid.newpage()  # add space before next plot
}

# Assuming 'data' is your dataframe with breast cancer data loaded

# 1. Diagnosis distribution plot
p1 <- ggplot(data, aes(x = diagnosis, fill = diagnosis)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Diagnosis Distribution", x = "Diagnosis", y = "Count") +
  scale_fill_manual(values = c("benign" = "skyblue", "malignant" = "salmon"))

# 2. Histogram of radius_mean by diagnosis
p2 <- ggplot(data, aes(x = radius_mean, fill = diagnosis)) +
  geom_histogram(bins = 30, alpha = 0.6, position = "identity", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Radius Mean by Diagnosis", x = "Radius Mean", y = "Count")

# 3. Scatter plot radius_mean vs area_mean colored diagnosis
p3 <- ggplot(data, aes(x = radius_mean, y = area_mean, color = diagnosis)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(title = "Radius Mean vs Area Mean", x = "Radius Mean", y = "Area Mean")

# 4. Correlation matrix plot of numeric features
numeric_data <- select(data, where(is.numeric))
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Now print each plot with its explanation and space
plot_with_explanation(p1, "This plot shows the distribution of diagnosis in the data.")
plot_with_explanation(p2, "This plot shows the histogram of radius_mean by diagnosis.")
plot_with_explanation(p3, "This plot shows the scatter plot of radius_mean vs area_mean colored by diagnosis.")

# corrplot() uses base graphics and draws to the device itself, so call it
# directly rather than through the grid-based helper above
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.7,
         title = "Correlation Matrix of Numeric Features", mar = c(0, 0, 1, 0))

Section 3: Model Training and Evaluation

K-Means Clustering

# Sample up to 1000 rows (or all if less than 1000)
sampled_data <- data %>% sample_n(min(1000, nrow(data)))

# Select numeric features for clustering
cluster_data <- sampled_data %>%
  select(radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean) %>%
  scale()

# Determine the optimal number of clusters using Elbow method
fviz_nbclust(cluster_data, kmeans, method = "wss") +
  ggtitle("Elbow Method for Choosing k")
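
# Alternative cross-check (not part of the original workflow): the average
# silhouette width often gives a clearer optimum than the elbow plot
fviz_nbclust(cluster_data, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method for Choosing k")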

# Apply k-means with k = 3 (or choose k based on the elbow plot)
kmeans_model <- kmeans(cluster_data, centers = 3, nstart = 25)

# Add cluster assignments to the sampled_data
sampled_data$Cluster <- as.factor(kmeans_model$cluster)

# Visualize clusters with custom colors
fviz_cluster(kmeans_model, data = cluster_data, geom = "point", ellipse.type = "norm") +
  ggtitle("K-means Clustering with Custom Colors") +
  scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A")) +
  scale_fill_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A"))
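
Because the sampled rows still carry the diagnosis label, a simple cross-tabulation shows how well the unsupervised clusters line up with the known classes. A quick sketch, assuming 'diagnosis' survived the sampling step:

# Compare cluster assignments against the known diagnosis labels
table(Cluster = sampled_data$Cluster, Diagnosis = sampled_data$diagnosis)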

Random Forest Model

# Simulated example dataset with numeric features and a character target
# (converted to a factor below)
model_data <- data.frame(
  Age = sample(20:70, 200, replace = TRUE),
  Stay_Length = sample(1:30, 200, replace = TRUE),
  Billing.Amount = runif(200, 100, 5000),
  Test.Results = sample(c("Positive", "Negative"), 200, replace = TRUE)
)

# Train-test split
set.seed(123) # You can remove or change the seed to get a different split
trainIndex <- createDataPartition(model_data$Test.Results, p = 0.8, list = FALSE)
train <- model_data[trainIndex, ]
test <- model_data[-trainIndex, ]

# Explicitly ensure the target variable is a factor
train$Test.Results <- as.factor(train$Test.Results)
test$Test.Results <- as.factor(test$Test.Results) # Ensure test target is also a factor

# Train Random Forest model
rf_model <- randomForest(Test.Results ~ ., data = train, importance = TRUE)

# Get predictions
predictions <- predict(rf_model, newdata = test)

# Get all unique levels from both predictions and the actual test results
all_levels <- unique(c(levels(predictions), levels(test$Test.Results)))

# Coerce both predictions and test$Test.Results to factors with the unified levels
predictions_factor <- factor(as.character(predictions), levels = all_levels)
test_results_factor <- factor(as.character(test$Test.Results), levels = all_levels)

# Calculate the confusion matrix using the factors with aligned levels
conf_matrix <- confusionMatrix(predictions_factor, test_results_factor)

# Print the confusion matrix
print(conf_matrix)

# Extract and print the accuracy
accuracy <- conf_matrix$overall['Accuracy']
print(paste("Accuracy:", accuracy))
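
Since the model was trained with importance = TRUE, randomForest can also report which features drove the predictions. A minimal sketch using the package's built-in helpers:

# Inspect feature importance (available because importance = TRUE above)
importance(rf_model)   # Mean decrease in accuracy and Gini per feature
varImpPlot(rf_model, main = "Random Forest Variable Importance")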

SVM Model

# Simulated example dataset with numeric features and a character target
# (converted to a factor below)
model_data <- data.frame(
  Age = sample(20:70, 200, replace = TRUE),
  Stay_Length = sample(1:30, 200, replace = TRUE),
  Billing.Amount = runif(200, 100, 5000),
  Test.Results = sample(c("Positive", "Negative"), 200, replace = TRUE)
)

# Train-test split
set.seed(123) # You can remove or change the seed to get a different split
trainIndex <- createDataPartition(model_data$Test.Results, p = 0.8, list = FALSE)
train <- model_data[trainIndex, ]
test <- model_data[-trainIndex, ]

# Explicitly ensure the target variable is a factor
train$Test.Results <- as.factor(train$Test.Results)
test$Test.Results <- as.factor(test$Test.Results) # Ensure test target is also a factor

# Train SVM model (svm() is provided by the e1071 package)
library(e1071)
svm_model <- svm(Test.Results ~ ., data = train, kernel = "linear", cost = 1)

# Make predictions on the test data
predictions <- predict(svm_model, newdata = test)

# Get all unique levels from both predictions and the actual test results
all_levels <- unique(c(levels(predictions), levels(test$Test.Results)))

# Coerce both predictions and test$Test.Results to factors with the unified levels
predictions_factor <- factor(as.character(predictions), levels = all_levels)
test_results_factor <- factor(as.character(test$Test.Results), levels = all_levels)

# Calculate the confusion matrix using the factors with aligned levels
conf_matrix <- confusionMatrix(predictions_factor, test_results_factor)

# Print the confusion matrix
print(conf_matrix)

# Extract and print the accuracy
accuracy <- conf_matrix$overall['Accuracy']
print(paste("Accuracy:", accuracy))
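
The cost parameter was fixed at 1 above; e1071's tune() helper can search over a grid with built-in cross-validation. A minimal sketch, assuming the same train data:

# Tune the cost parameter via e1071's built-in cross-validation
tuned <- tune(svm, Test.Results ~ ., data = train,
              ranges = list(cost = c(0.1, 1, 10, 100)),
              kernel = "linear")
summary(tuned)               # Cross-validated error for each cost value
best_svm <- tuned$best.model # Refit model with the best cost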

Q: What is R and why is it used for data analysis and machine learning?

A: R is a popular programming language and environment for statistical computing and graphics. It is widely used in data analysis, machine learning, and data visualization due to its flexibility, strong built-in visualization tools, and extensive package ecosystem (CRAN).

Q: What are the key steps in data analysis using R?

A: The key steps in data analysis using R include:

  1. Data loading: Loading the data into R with functions such as read.csv() and read.table().
  2. Data cleaning: Handling missing values, outliers, and incorrect data types.
  3. Data visualization: Visualizing the data with packages and functions such as ggplot2 and base plot().
  4. Model training: Training machine learning models with functions such as randomForest() and svm().
  5. Model evaluation: Assessing the trained models with metrics such as accuracy, precision, and recall.

Q: What are some common data visualization techniques used in R?

A: Some common data visualization techniques used in R include:

  1. Bar plots: Used to compare categorical data.
  2. Scatter plots: Used to visualize the relationship between two continuous variables.
  3. Histograms: Used to visualize the distribution of a single variable.
  4. Box plots: Used to compare the distribution of a variable across different groups (see the sketch after this list).
  5. Heatmaps: Used to visualize matrix-structured data, such as correlation matrices, through color intensity.
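
For instance, a box plot comparing a numeric feature across diagnosis groups takes a few lines of ggplot2; a minimal sketch, assuming the breast cancer data from Section 1:

# Box plot of radius_mean across diagnosis groups
ggplot(data, aes(x = diagnosis, y = radius_mean, fill = diagnosis)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Radius Mean by Diagnosis", x = "Diagnosis", y = "Radius Mean")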

Q: What are some common machine learning algorithms used in R?

A: Some common machine learning algorithms used in R include:

  1. Random Forest: A decision tree-based algorithm used for classification and regression tasks.
  2. Support Vector Machines (SVM): A supervised learning algorithm used for classification and regression tasks.
  3. K-Means Clustering: An unsupervised learning algorithm used for clustering data.
  4. Principal Component Analysis (PCA): An unsupervised technique for dimensionality reduction (see the sketch after this list).
  5. Gradient Boosting: A decision tree-based algorithm used for classification and regression tasks.
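
Of these, PCA ships with base R via prcomp(). A minimal sketch on the numeric breast cancer features, assuming 'data' from Section 1 is still in memory:

# PCA on the scaled numeric features
pca <- prcomp(select(data, where(is.numeric)), scale. = TRUE)
summary(pca)  # Variance explained per component

# Visualize the first two components, colored by diagnosis (factoextra)
fviz_pca_ind(pca, habillage = data$diagnosis,
             addEllipses = TRUE, label = "none")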

Q: How can I handle missing values in R?

A: There are several ways to handle missing values in R, including:

  1. Listwise deletion: Deleting all rows with missing values.
  2. Mean imputation: Replacing missing values with the mean of the variable.
  3. Median imputation: Replacing missing values with the median of the variable (both are sketched after this list).
  4. Regression imputation: Using a regression model to predict the missing values.
  5. Multiple imputation: Creating multiple versions of the dataset with different imputed values.
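
As a quick illustration of options 2 and 3, mean or median imputation is a one-liner per column; a minimal sketch on a hypothetical numeric column x:

# Hypothetical data frame with missing values
df <- data.frame(x = c(1, 2, NA, 4, 100, NA))

# Mean imputation
df$x_mean_imp <- ifelse(is.na(df$x), mean(df$x, na.rm = TRUE), df$x)

# Median imputation (more robust to outliers such as the 100 above)
df$x_med_imp <- ifelse(is.na(df$x), median(df$x, na.rm = TRUE), df$x)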

Q: How can I evaluate the performance of a machine learning model in R?

A: There are several ways to evaluate the performance of a machine learning model in R, including:

  1. Accuracy: Calculating the proportion of correctly classified instances.
  2. Precision: Calculating the proportion of true positives among all positive predictions.
  3. Recall: Calculating the proportion of true positives among all actual positive instances.
  4. F1 score: Calculating the harmonic mean of precision and recall (see the sketch after this list).
  5. ROC-AUC: Calculating the area under the receiver operating characteristic curve.
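
With caret, most of these come straight out of the confusionMatrix() object computed earlier; a minimal sketch reusing conf_matrix from the modeling sections:

# Precision, recall, and F1 are stored in the byClass slot
conf_matrix$byClass[c("Precision", "Recall", "F1")]

# Accuracy lives in the overall slot
conf_matrix$overall["Accuracy"]

# ROC-AUC requires class probabilities rather than hard predictions
# (e.g., via the pROC package)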

Q: How can I optimize the performance of a machine learning model in R?

A: There are several ways to optimize the performance of a machine learning model in R, including:

  1. Hyperparameter tuning: Adjusting the hyperparameters of the model to improve its performance.
  2. Feature engineering: Creating new features from existing ones to improve the model's performance.
  3. Regularization: Adding a penalty term to the loss function to prevent overfitting.
  4. Ensemble methods: Combining the predictions of multiple models to improve the overall performance.
  5. Cross-validation: Evaluating the model on multiple held-out subsets of the data to get a reliable estimate of generalization performance (combined with hyperparameter tuning in the sketch below).
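
caret's train() wraps both ideas: trainControl() sets up the cross-validation and tuneGrid defines the hyperparameter search. A minimal sketch tuning the random forest's mtry on the simulated data from Section 3:

# 5-fold cross-validation over a small mtry grid
ctrl <- trainControl(method = "cv", number = 5)
rf_tuned <- train(Test.Results ~ ., data = train,
                  method = "rf",
                  trControl = ctrl,
                  tuneGrid = expand.grid(mtry = 1:3))
print(rf_tuned)  # Cross-validated accuracy for each mtry value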