COS482-HW3/task1.py

# HW3 - Task 1
# Nicholas Pease
# I decided to try MongoDB because of its document structure, which aligns closely with the JSON format in which I store the majority of the data I work with in other projects.
# I stored the data in this assignment in a collection of movie documents, where each document contains information about a movie, its directors, and its cast.
# Answer to part c: If we were accessing movies primarily by their years, followed by their names, I would create a compound index on the 'year' and 'movie_name' fields in the MongoDB collection.

import csv
import pandas as pd
from pymongo import MongoClient, ASCENDING
from pymongo.errors import ConnectionFailure, DuplicateKeyError
import os
from typing import Dict, List, Optional
from collections import defaultdict

# In-memory staging tables filled by the load_* functions before upload.
movies = {}  # movie_id -> movie document (name, year, rank, directors, cast)
persons = {}  # person_id -> {'person_id', 'name', 'gender'}
directors = {}  # director_id -> {'director_id', 'name'}
# MongoDB handles; they stay None until connect() assigns them.
client = None
db = None
movies_collection = None

def connect():
    """Open the MongoDB connection and bind the module-level handles.

    Connects to ``mongodb://localhost:27017/`` with a 5 second server
    selection timeout and issues a ``ping`` command. Only after the ping
    succeeds are ``client``, ``db`` (the ``imdb_database`` database) and
    ``movies_collection`` (its ``movies`` collection) published as module
    globals, so a failed attempt never leaves a half-initialised client
    behind.

    Raises:
        Exception: whatever the driver raises when the ping fails is
            re-raised unchanged, after the new client has been closed.
    """
    global client, db, movies_collection

    candidate = MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=5000)
    try:
        # The ping forces a round trip so an unreachable server is reported
        # here instead of on the first real operation.
        candidate.admin.command('ping')
    except Exception:
        # Release the client's resources before propagating the failure.
        candidate.close()
        raise

    client = candidate
    db = client["imdb_database"]
    movies_collection = db.movies

    print("Connected to MongoDB database: imdb_database")

def load_movies_data(file_path):
    """Parse the movie file into the module-level ``movies`` dict.

    The file is read as latin-1 text; its first line is treated as a header
    and discarded. Every other non-blank line is split on commas and read as
    ``id, name, year, rank``. The name may itself contain commas, so the
    first field is the id, the last two fields are year and rank, and
    everything in between is joined back into the name. Rows with fewer than
    four fields are read in column order with the missing trailing columns
    treated as empty.

    Each movie is stored under its integer id as a dict with the keys
    ``movie_id``, ``movie_name``, ``year`` (int or None), ``rank`` (float or
    None) and empty ``directors`` / ``cast`` lists.

    Args:
        file_path: path of the movie text file.

    Raises:
        ValueError: if an id, year or rank field is present but not numeric.
    """
    global movies
    with open(file_path, 'r', encoding='latin-1') as file:
        next(file, None)  # skip header; tolerate a completely empty file

        for line in file:
            line = line.strip()
            if not line:
                # Blank lines carry no record (same policy as the cast loader).
                continue

            parts = line.split(',')
            # Pad short rows so the positional slicing below always sees
            # id / name / year / rank; a no-op for rows with >= 4 fields.
            parts.extend([''] * (4 - len(parts)))

            movie_id = int(parts[0])
            movie_name = ','.join(parts[1:-2])
            year_str = parts[-2].strip()
            rank_str = parts[-1].strip()

            movies[movie_id] = {
                'movie_id': movie_id,
                'movie_name': movie_name,
                'year': int(year_str) if year_str else None,
                'rank': float(rank_str) if rank_str else None,
                'directors': [],
                'cast': []
            }

    print(f"Loaded {len(movies)} movies")

def load_persons_data(file_path):
    """Load the person file into the module-level ``persons`` dict.

    The CSV is read as latin-1 and must provide the columns ``id``,
    ``fname``, ``lname`` and ``gender``. Each person is stored under the
    integer id as ``{'person_id', 'name', 'gender'}``, where ``name`` is
    "fname lname" with surrounding whitespace stripped.

    pandas' default NA detection is switched off so name parts are kept
    verbatim: an empty field stays an empty string and tokens such as "NA"
    or "null" are not converted to a missing value that would otherwise be
    formatted into the name as "<NA>". As a consequence an empty gender is
    stored as ``''``.

    Args:
        file_path: path of the person CSV file.
    """
    global persons
    persons_df = pd.read_csv(
        file_path,
        encoding='latin-1',
        dtype={'id': 'int32', 'fname': 'string', 'lname': 'string', 'gender': 'string'},
        keep_default_na=False,
    )

    # Iterate the columns in lockstep; cheaper than building a Series per row.
    columns = (persons_df['id'], persons_df['fname'], persons_df['lname'], persons_df['gender'])
    for raw_id, fname, lname, gender in zip(*columns):
        person_id = int(raw_id)
        persons[person_id] = {
            'person_id': person_id,
            'name': f"{fname} {lname}".strip(),
            'gender': str(gender)
        }

    print(f"Loaded {len(persons)} persons")

def load_directors_data(file_path):
    """Load the director file into the module-level ``directors`` dict.

    The CSV is read as latin-1 and must provide the columns ``id``,
    ``fname`` and ``lname``. Each director is stored under the integer id as
    ``{'director_id', 'name'}``, where ``name`` is "fname lname" with
    surrounding whitespace stripped.

    pandas' default NA detection is switched off so name parts are kept
    verbatim: an empty field stays an empty string and tokens such as "NA"
    or "null" are not converted to a missing value that would otherwise be
    formatted into the name as "<NA>".

    Args:
        file_path: path of the director CSV file.
    """
    global directors
    directors_df = pd.read_csv(
        file_path,
        encoding="latin-1",
        dtype={'id': 'int32', 'fname': 'string', 'lname': 'string'},
        keep_default_na=False,
    )

    # Iterate the columns in lockstep; cheaper than building a Series per row.
    columns = (directors_df['id'], directors_df['fname'], directors_df['lname'])
    for raw_id, fname, lname in zip(*columns):
        director_id = int(raw_id)
        directors[director_id] = {
            'director_id': director_id,
            'name': f"{fname} {lname}".strip()
        }

    print(f"Loaded {len(directors)} directors")

def load_movie_directors_data(file_path):
    """Attach directors to movies using the movie/director link file.

    Reads a latin-1 CSV with integer columns ``did`` (director id) and
    ``mid`` (movie id). For every pair whose movie is present in ``movies``
    and whose director is present in ``directors``, a
    ``{'director_id', 'name'}`` entry is appended to that movie's
    ``directors`` list. Pairs referring to an unknown movie or director are
    ignored.

    Args:
        file_path: path of the movie/director link CSV file.
    """
    global movies, directors
    links = pd.read_csv(file_path, encoding='latin-1', dtype={'did': 'int32', 'mid': 'int32'})

    for raw_did, raw_mid in zip(links['did'], links['mid']):
        did = int(raw_did)
        mid = int(raw_mid)

        # Only link pairs where both sides were loaded earlier.
        if mid not in movies or did not in directors:
            continue

        movies[mid]['directors'].append({
            'director_id': did,
            'name': directors[did]['name']
        })

def load_cast_data(file_path):
    """Attach cast members to movies using the cast file.

    The file is read as latin-1 text and its first line is discarded as a
    header. Each remaining line is split on commas; lines with fewer than
    three fields (blank lines included) are skipped. The first three fields
    are taken as person id, movie id and role. When the movie exists in
    ``movies`` and the person exists in ``persons``, an entry with the keys
    ``person_id``, ``name``, ``gender`` and ``role`` is appended to the
    movie's ``cast`` list. The number of entries added is printed at the end.

    Args:
        file_path: path of the cast text file.
    """
    global movies, persons

    matched = 0

    with open(file_path, 'r', encoding='latin-1') as handle:
        next(handle)  # header row carries no data

        for raw in handle:
            # A blank line splits into a single empty field, so the length
            # check below also filters those out.
            fields = raw.strip().split(',')
            if len(fields) < 3:
                continue

            person_id = int(fields[0])
            movie_id = int(fields[1])

            if movie_id not in movies or person_id not in persons:
                continue

            person = persons[person_id]
            movies[movie_id]['cast'].append({
                'person_id': person_id,
                'name': person['name'],
                'gender': person['gender'],
                'role': fields[2]
            })
            matched += 1

    print(f"Loaded {matched} cast entries")

def insert_movies_to_mongodb():
    """Replace the MongoDB ``movies`` collection with the loaded movies.

    Drops the existing collection, inserts one document per entry of the
    module-level ``movies`` dict in a single bulk call, creates an index
    named ``idx_movie_name`` on ``movie_name``, and prints how many
    documents were inserted.

    Requires ``movies_collection`` to have been set by ``connect()``.
    """
    global movies, movies_collection
    movies_collection.drop()

    # Build fresh dicts so the driver adding ``_id`` to each inserted
    # document does not touch the in-memory ``movies`` entries.
    documents = [
        {
            'movie_id': movie_data['movie_id'],
            'movie_name': movie_data['movie_name'],
            'year': movie_data['year'],
            'rank': movie_data['rank'],
            'directors': movie_data['directors'],
            'cast': movie_data['cast']
        }
        for movie_data in movies.values()
    ]

    inserted_count = 0
    if documents:  # insert_many rejects an empty document list
        result = movies_collection.insert_many(documents)
        inserted_count = len(result.inserted_ids)

    # Create indexes
    movies_collection.create_index("movie_name", name="idx_movie_name")

    print(f"Inserted {inserted_count} movies into MongoDB")

def main():
    """Run the full pipeline: load the IMDB files, upload them, query Shrek.

    Reads the five data files from the ``IMDB`` directory into memory,
    prints summary counts, connects to MongoDB, rebuilds the ``movies``
    collection and prints the document whose ``movie_name`` is
    "Shrek (2001)". The MongoDB client is closed even when the upload or
    the query fails.
    """
    # Load in the files
    movie_file = os.path.join("IMDB", "IMDBMovie.txt")
    person_file = os.path.join("IMDB", "IMDBPerson.txt")
    director_file = os.path.join("IMDB", "IMDBDirectors.txt")
    movie_director_file = os.path.join("IMDB", "IMDBMovie_Directors.txt")
    cast_file = os.path.join("IMDB", "IMDBCast.txt")

    # Load the data into memory; persons and directors must be loaded
    # before the link files that reference them.
    load_movies_data(movie_file)
    load_persons_data(person_file)
    load_directors_data(director_file)
    load_movie_directors_data(movie_director_file)
    load_cast_data(cast_file)

    total_movies = len(movies)
    movies_with_directors = sum(1 for movie in movies.values() if movie['directors'])
    movies_with_cast = sum(1 for movie in movies.values() if movie['cast'])

    print(f"\nTotal movies loaded: {total_movies}")
    print(f"Movies with directors: {movies_with_directors}")
    print(f"Movies with cast: {movies_with_cast}")

    try:
        # Insert the data into MongoDB
        connect()
        insert_movies_to_mongodb()

        # b Query DB for "Shrek (2001)"
        result = movies_collection.find_one({'movie_name': "Shrek (2001)"})
        print('\nQuery Result for "Shrek (2001)":')
        print(result)
    finally:
        # client is still None if connect() failed before assigning it.
        if client is not None:
            client.close()

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()