Final

2025-12-18 23:06:51 +00:00
parent 68df013917
commit 47a7fed915
4 changed files with 176 additions and 325 deletions
@@ -1,2 +0,0 @@
-ipykernel==6.4.1
-pyspark==3.1.2
@@ -1,6 +1,7 @@
 # HW3 - Task 1
 # Nicholas Pease
-# IMDB Data Loading into MongoDB
+# I decided to try MongoDB because of its document structure, which aligns closely with the JSON format in which I store the majority of the data I work with in other projects.
+# I stored the data in this assignment in a collection of movie documents, where each document contains information about a movie, its directors, and its cast.

 import csv
 import pandas as pd
@@ -10,7 +11,6 @@ import os
 from typing import Dict, List, Optional
 from collections import defaultdict

-# Global data structures for processing
 movies = {}
 persons = {}
 directors = {}
@@ -18,46 +18,25 @@ client = None
 db = None
 movies_collection = None

-def connect(connection_string: str = "mongodb://localhost:27017/", 
-           database_name: str = "imdb_database") -> bool:
+def connect():
    global client, db, movies_collection
-    try:
-        client = MongoClient(connection_string, serverSelectionTimeoutMS=5000)
+    
+    client = MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=5000)
    client.admin.command('ping')
-        db = client[database_name]
+    db = client["imdb_database"]
    movies_collection = db.movies
    
-        print(f"Connected to MongoDB database: {database_name}")
-        return True
+    print(f"Connected to MongoDB database: imdb_database")

-    except ConnectionFailure as e:
-        print(f"Failed to connect to MongoDB: {e}")
-        return False
-    except Exception as e:
-        print(f"MongoDB connection error: {e}")
-        return False
-
-def disconnect():
-    global client
-    if client:
-        client.close()
-
-def load_movies_data(file_path: str, sample_size: Optional[int] = None) -> bool:
+def load_movies_data(file_path):
    global movies
-    try:
    movies_loaded = 0
    with open(file_path, 'r', encoding='latin-1') as file:
-            next(file)
+        next(file) # skip header
        
        for line in file:
-                if sample_size and movies_loaded >= sample_size:
-                    break
-                    
            line = line.strip()
-                if not line:
-                    continue
            
-                try:
            parts = line.split(',')
            
            movie_id = int(parts[0])
@@ -81,29 +60,11 @@ def load_movies_data(file_path: str, sample_size: Optional[int] = None) -> bool:
            
            movies_loaded += 1
    
-                except (ValueError, IndexError):
-                    continue
-        
    print(f"Loaded {len(movies)} movies")
-        return True

-    except Exception as e:
-        print(f"Error loading movies: {e}")
-        return False
-
-def load_persons_data(file_path: str) -> bool:
+def load_persons_data(file_path):
    global persons
-    try:
-        persons_df = pd.read_csv(
-                    file_path,
-                    encoding='latin-1',
-                    dtype={
-                        'id': 'int32',
-                        'fname': 'string',
-                        'lname': 'string', 
-                        'gender': 'string'
-                    }
-                )
+    persons_df = pd.read_csv(file_path,encoding='latin-1',dtype={'id': 'int32','fname': 'string','lname': 'string', 'gender': 'string'})
    
    for _, row in persons_df.iterrows():
        person_id = int(row['id'])
@@ -115,24 +76,10 @@ def load_persons_data(file_path: str) -> bool:
        }
    
    print(f"Loaded {len(persons)} persons")
-        return True

-    except Exception as e:
-        print(f"Error loading persons: {e}")
-        return False
-
-def load_directors_data(file_path: str) -> bool:
+def load_directors_data(file_path):
    global directors
-    try:
-        directors_df = pd.read_csv(
-            file_path,
-            encoding="latin-1",
-            dtype={
-                'id': 'int32',
-                'fname': 'string',
-                'lname': 'string'
-            }
-        )
+    directors_df = pd.read_csv(file_path,encoding="latin-1",dtype={'id': 'int32','fname': 'string','lname': 'string'})
    
    for _, row in directors_df.iterrows():
        director_id = int(row['id'])
@@ -143,20 +90,10 @@ def load_directors_data(file_path: str) -> bool:
        }
    
    print(f"Loaded {len(directors)} directors")
-        return True

-    except Exception as e:
-        print(f"Error loading directors: {e}")
-        return False
-
-def load_movie_directors_data(file_path: str) -> bool:
+def load_movie_directors_data(file_path):
    global movies, directors
-    try:
-        movie_directors_df = pd.read_csv(
-            file_path,
-            encoding='latin-1',
-            dtype={'did': 'int32', 'mid': 'int32'}
-        )
+    movie_directors_df = pd.read_csv(file_path,encoding='latin-1',dtype={'did': 'int32', 'mid': 'int32'})
    
    linked_count = 0
    for _, row in movie_directors_df.iterrows():
@@ -171,45 +108,28 @@ def load_movie_directors_data(file_path: str) -> bool:
            movies[movie_id]['directors'].append(director_info)
            linked_count += 1

-        return True
-        
-    except Exception as e:
-        print(f"Error loading movie-director relationships: {e}")
-        return False
-
-def load_cast_data(file_path: str, max_cast_per_movie: int = 25) -> bool:
+def load_cast_data(file_path):
    global movies, persons
-    try:
-        if not os.path.exists(file_path):
-            return True
    
-        movie_cast_count = defaultdict(int)
    linked_count = 0
    
    with open(file_path, 'r', encoding='latin-1') as file:
-            next(file)
-            
-            for line_num, line in enumerate(file):
-                if line_num > 100000:
-                    break
+        next(file)  # skip header
        
+        for line in file:
            line = line.strip()
            if not line:
                continue
                
-                try:
            parts = line.split(',')
-                    
            if len(parts) < 3:
                continue
            
            person_id = int(parts[0])
            movie_id = int(parts[1])
-                    role = ','.join(parts[2:]) if len(parts) > 2 else "[Unknown]"
-                    
-                    if (movie_id in movies and person_id in persons and 
-                        movie_cast_count[movie_id] < max_cast_per_movie):
+            role = parts[2]
            
+            if (movie_id in movies and person_id in persons):
                cast_info = {
                    'person_id': person_id,
                    'name': persons[person_id]['name'],
@@ -217,37 +137,16 @@ def load_cast_data(file_path: str, max_cast_per_movie: int = 25) -> bool:
                    'role': role
                }
                movies[movie_id]['cast'].append(cast_info)
-                        movie_cast_count[movie_id] += 1
                linked_count += 1
    
-                except (ValueError, KeyError):
-                    continue
-        
-        if linked_count > 0:
    print(f"Loaded {linked_count} cast entries")

-        return True
-        
-    except Exception as e:
-        return True
-
-def create_indexes():
-    global movies_collection
-    try:
-        movies_collection.create_index("movie_name", unique=True, name="idx_movie_name")
-        movies_collection.create_index("year", name="idx_year")
-        movies_collection.create_index("rank", name="idx_rank")
-        movies_collection.create_index("directors.name", name="idx_director_name")
-        
-    except Exception as e:
-        pass
-
-def insert_movies_to_mongodb(batch_size: int = 2000) -> bool:
+def insert_movies_to_mongodb():
    global movies, movies_collection
-    try:
    movies_collection.drop()
    
-        documents = []
+    inserted_count = 0
+    
    for movie_id, movie_data in movies.items():
        document = {
            'movie_id': movie_data['movie_id'],
@@ -257,78 +156,30 @@ def insert_movies_to_mongodb(batch_size: int = 2000) -> bool:
            'directors': movie_data['directors'],
            'cast': movie_data['cast']
        }
-            documents.append(document)
        
-            if len(documents) >= batch_size:
-                movies_collection.insert_many(documents, ordered=False)
-                documents = []
+        movies_collection.insert_one(document)
+        inserted_count += 1 
    
-        if documents:
-            movies_collection.insert_many(documents, ordered=False)
-        
-        create_indexes()
+    # Create indexes
+    movies_collection.create_index("movie_name", name="idx_movie_name")
    
    total_count = movies_collection.count_documents({})
-        print(f"Inserted {total_count} movies into MongoDB")
-        
-        return True
-        
-    except Exception as e:
-        print(f"Error inserting data into MongoDB: {e}")
-        return False
-
-def query_movie_by_name(movie_name: str) -> Optional[Dict]:
-    global movies_collection
-    try:
-        result = movies_collection.find_one({"movie_name": movie_name})
-        return result
-    except Exception as e:
-        print(f"Error querying movie by name: {e}")
-        return None
-
-def query_movies_by_year_range(start_year: int, end_year: int) -> List[Dict]:
-    global movies_collection
-    try:
-        query = {"year": {"$gte": start_year, "$lte": end_year}}
-        results = movies_collection.find(query).sort("year", 1)
-        return list(results)
-    except Exception as e:
-        print(f"Error querying movies by year range: {e}")
-        return []
+    print(f"Inserted {inserted_count} movies into MongoDB")

 def main():
-    try:
-        print("IMDB DATA LOADING FOR MONGODB")
-        print("=" * 50)
+        # Load in the files
+        movie_file = os.path.join("IMDB", "IMDBMovie.txt")
+        person_file = os.path.join("IMDB", "IMDBPerson.txt") 
+        director_file = os.path.join("IMDB", "IMDBDirectors.txt")
+        movie_director_file = os.path.join("IMDB", "IMDBMovie_Directors.txt")
+        cast_file = os.path.join("IMDB", "IMDBCast.txt")
        
-        data_directory = "IMDB"
-        sample_size = None
-        
-        movie_file = os.path.join(data_directory, "IMDBMovie.txt")
-        person_file = os.path.join(data_directory, "IMDBPerson.txt") 
-        director_file = os.path.join(data_directory, "IMDBDirectors.txt")
-        movie_director_file = os.path.join(data_directory, "IMDBMovie_Directors.txt")
-        cast_file = os.path.join(data_directory, "IMDBCast.txt")
-        
-        if not load_movies_data(movie_file, sample_size=sample_size):
-            print("Failed to load movies")
-            return
-        
-        if not load_persons_data(person_file):
-            print("Failed to load persons")
-            return
-            
-        if not load_directors_data(director_file):
-            print("Failed to load directors")
-            return
-            
-        if not load_movie_directors_data(movie_director_file):
-            print("Failed to load movie-director relationships")
-            return
-
-        if not load_cast_data(cast_file):
-            print("Failed to load cast data")
-            return
+        # Load the data into memory
+        load_movies_data(movie_file)
+        load_persons_data(person_file)
+        load_directors_data(director_file)
+        load_movie_directors_data(movie_director_file)
+        load_cast_data(cast_file)
        
        total_movies = len(movies)
        movies_with_directors = sum(1 for movie in movies.values() if movie['directors'])
@@ -338,31 +189,16 @@ def main():
        print(f"Movies with directors: {movies_with_directors}")
        print(f"Movies with cast: {movies_with_cast}")
        
-        if connect():
-            if insert_movies_to_mongodb(batch_size=1000):
-                print("Data inserted into MongoDB successfully")
+        # Insert the data into MongoDB
+        connect()
+        insert_movies_to_mongodb()
        
-                test_movie = query_movie_by_name("$ (1971)")
-                if test_movie:
-                    print(f"\nFound movie: {test_movie['movie_name']} ({test_movie.get('year', 'N/A')})")
+        # b. Query the DB for "Shrek (2001)"
+        result = movies_collection.find_one({'movie_name': "Shrek (2001)"})
+        print('\nQuery Result for "Shrek (2001)":')
+        print(result)

-                total_in_db = movies_collection.count_documents({})
-                movies_with_directors_db = movies_collection.count_documents({"directors": {"$ne": []}})
+        global client
+        client.close()

-                print(f"Total movies in database: {total_in_db}")
-                print(f"Movies with directors: {movies_with_directors_db}")
-                
-            else:
-                print("Failed to insert data into MongoDB")
-        else:
-            print("MongoDB not available")
-        
-    except Exception as e:
-        print(f"Error in main execution: {e}")
-    
-    finally:
-        disconnect()
-
-
-if __name__ == "__main__":
-    main()
+main()
@@ -1,3 +1,10 @@
+# HW3 - Task 2
+# Nicholas Pease
+# This loads a directed graph from an input file, 
+# computes the Page Rank for each vertex using Spark, 
+# and saves the results to an output file.
+# This follows the structure dictated by the assignment description.
+
 from pyspark import SparkConf, SparkContext

 def main():
@@ -1,3 +1,10 @@
+# HW3 - Task 3
+# Nicholas Pease
+# This takes the output from Task 2 (Page Rank results),
+# loads it into a Spark DataFrame, performs SQL queries to
+# extract specific information, joins with another input file with labels,
+# and saves the final results to a CSV file.
+
 from pyspark import SparkConf, SparkContext
 from pyspark.sql import SparkSession

@@ -30,9 +37,12 @@ def main():
    names_df = spark.createDataFrame(names_rdd, ["id", "name"])
    names_df.createOrReplaceTempView("names")
    
-    # e. Join and print dataframes on id
+    # e. Join the dataframes on id, print the result, and save it to t3-out.csv (fields are written space-separated)
    result = spark.sql("SELECT n.id, n.name, p.page_rank FROM names n JOIN page_rank p ON n.id = p.id")
    print("Joined DataFrame (id, name, page_rank):")
    result.show()    
+    with open("t3-out.csv", "w") as f:
+        for row in result.collect():
+            f.write(f"{row['id']} {row['name']} {row['page_rank']}\n")
    
 main()