Files
2025-12-18 18:15:29 -05:00

205 lines
6.9 KiB
Python

# HW3 - Task 1
# Nicholas Pease
# I decided to try MongoDB because of its document structure, which aligns closely with the JSON format in which I store a majority of the data I work with in other projects.
# I stored the data in this assignment in a collection of movie documents, where each document contains information about a movie, its directors, and its cast.
# Answer to part c: If we were accessing movies primarily by their years, followed by their names, I would create a compound index on the 'year' and 'movie_name' fields in the MongoDB collection.
import csv
import pandas as pd
from pymongo import MongoClient, ASCENDING
from pymongo.errors import ConnectionFailure, DuplicateKeyError
import os
from typing import Dict, List, Optional
from collections import defaultdict
# In-memory staging tables filled by the load_* functions, each keyed by integer id.
movies, persons, directors = {}, {}, {}

# MongoDB handles; they stay None until connect() assigns them.
client = db = movies_collection = None
def connect():
    """Connect to the local MongoDB server and bind the module-level handles.

    The server is pinged before anything is published, so an unreachable
    server fails here (bounded by the 5000 ms server-selection timeout)
    instead of on the first real query. On success the globals ``client``,
    ``db`` and ``movies_collection`` refer to the server connection, the
    ``imdb_database`` database and its ``movies`` collection.

    Raises:
        Whatever the ping raises (for an unreachable server this is a
        pymongo connection error). The half-open client is closed first and
        the globals are left untouched.
    """
    global client, db, movies_collection
    new_client = MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=5000)
    try:
        new_client.admin.command('ping')
    except Exception:
        # Do not leak the client's sockets/monitor threads, and do not leave
        # a dead connection in the global `client`.
        new_client.close()
        raise
    client = new_client
    db = client["imdb_database"]
    movies_collection = db.movies
    print("Connected to MongoDB database: imdb_database")
def load_movies_data(file_path):
    """Parse the movie file into the module-level ``movies`` dict.

    Rows are split on commas. The first field is the movie id and, for rows
    with at least four fields, the last two are year and rank; everything in
    between is re-joined with commas so that names containing commas survive.
    Rows with fewer than four fields are read as ``id[,name[,year]]`` with no
    rank. Empty year/rank fields are stored as ``None``.

    Blank lines are skipped and a completely empty file loads no movies.

    Args:
        file_path: Path to the latin-1 encoded movie file whose first line is
            a header.

    Raises:
        ValueError: if an id, year or rank field is present but not numeric.
    """
    global movies
    with open(file_path, 'r', encoding='latin-1') as file:
        next(file, None)  # skip header; tolerate a completely empty file
        for line in file:
            line = line.strip()
            if not line:
                # A stray blank line would otherwise crash on int('').
                continue
            parts = line.split(',')
            movie_id = int(parts[0])
            if len(parts) > 3:
                # id, <name possibly containing commas>, year, rank
                movie_name = ','.join(parts[1:-2])
                year_str, rank_str = parts[-2], parts[-1]
            else:
                # Short row: take the year from the third field, not from the
                # name column, and leave the rank empty.
                movie_name = parts[1] if len(parts) > 1 else ''
                year_str = parts[2] if len(parts) > 2 else ''
                rank_str = ''
            year = int(year_str) if year_str.strip() else None
            rank = float(rank_str) if rank_str.strip() else None
            movies[movie_id] = {
                'movie_id': movie_id,
                'movie_name': movie_name,
                'year': year,
                'rank': rank,
                'directors': [],
                'cast': [],
            }
    print(f"Loaded {len(movies)} movies")
def load_persons_data(file_path):
    """Load the person file into the module-level ``persons`` dict.

    The file is read with pandas and must provide the columns ``id``,
    ``fname``, ``lname`` and ``gender``. Each person is stored under its
    integer id as ``{'person_id', 'name', 'gender'}``.

    Missing name parts are left out of ``name`` (instead of being rendered as
    the text ``<NA>``), and a missing gender is stored as ``None``.

    Args:
        file_path: Path to the latin-1 encoded CSV file.
    """
    global persons
    persons_df = pd.read_csv(
        file_path,
        encoding='latin-1',
        dtype={'id': 'int32', 'fname': 'string', 'lname': 'string', 'gender': 'string'},
    )
    # Iterating the columns directly avoids building a Series per row, which
    # is what iterrows() does.
    columns = zip(persons_df['id'], persons_df['fname'], persons_df['lname'], persons_df['gender'])
    for raw_id, fname, lname, gender in columns:
        person_id = int(raw_id)
        name_parts = [part for part in (fname, lname) if not pd.isna(part)]
        persons[person_id] = {
            'person_id': person_id,
            'name': ' '.join(name_parts).strip(),
            'gender': None if pd.isna(gender) else str(gender),
        }
    print(f"Loaded {len(persons)} persons")
def load_directors_data(file_path):
    """Load the director file into the module-level ``directors`` dict.

    The file is read with pandas and must provide the columns ``id``,
    ``fname`` and ``lname``. Each director is stored under its integer id as
    ``{'director_id', 'name'}``.

    Missing name parts are left out of ``name`` instead of being rendered as
    the text ``<NA>``.

    Args:
        file_path: Path to the latin-1 encoded CSV file.
    """
    global directors
    directors_df = pd.read_csv(
        file_path,
        encoding="latin-1",
        dtype={'id': 'int32', 'fname': 'string', 'lname': 'string'},
    )
    # Column-wise iteration avoids the per-row Series that iterrows() builds.
    columns = zip(directors_df['id'], directors_df['fname'], directors_df['lname'])
    for raw_id, fname, lname in columns:
        director_id = int(raw_id)
        name_parts = [part for part in (fname, lname) if not pd.isna(part)]
        directors[director_id] = {
            'director_id': director_id,
            'name': ' '.join(name_parts).strip(),
        }
    print(f"Loaded {len(directors)} directors")
def load_movie_directors_data(file_path):
    """Attach directors to movies using the movie/director link file.

    The file is read with pandas and must provide the integer columns ``did``
    (director id) and ``mid`` (movie id). For every row whose movie is in
    ``movies`` and whose director is in ``directors``, a
    ``{'director_id', 'name'}`` entry is appended to that movie's
    ``directors`` list. Rows referring to unknown ids are ignored.

    Args:
        file_path: Path to the latin-1 encoded CSV file.
    """
    global movies, directors
    movie_directors_df = pd.read_csv(
        file_path,
        encoding='latin-1',
        dtype={'did': 'int32', 'mid': 'int32'},
    )
    # Walk the two id columns in lockstep instead of materialising a Series
    # per row with iterrows().
    for raw_did, raw_mid in zip(movie_directors_df['did'], movie_directors_df['mid']):
        director_id = int(raw_did)
        movie_id = int(raw_mid)
        movie = movies.get(movie_id)
        director = directors.get(director_id)
        if movie is None or director is None:
            continue
        movie['directors'].append({
            'director_id': director_id,
            'name': director['name'],
        })
def load_cast_data(file_path):
    """Attach cast members to movies using the cast file.

    Each data row is parsed as ``person_id,movie_id,role``. For every row
    whose movie is in ``movies`` and whose person is in ``persons``, a
    ``{'person_id', 'name', 'gender', 'role'}`` entry is appended to that
    movie's ``cast`` list. Blank lines, rows with fewer than three fields and
    rows referring to unknown ids are skipped. A completely empty file loads
    nothing.

    Args:
        file_path: Path to the latin-1 encoded cast file whose first line is
            a header.

    Raises:
        ValueError: if a person or movie id is not an integer.
    """
    global movies, persons
    linked_count = 0
    with open(file_path, 'r', encoding='latin-1') as file:
        next(file, None)  # skip header; tolerate a completely empty file
        for line in file:
            line = line.strip()
            if not line:
                continue
            # Split only on the first two commas so a role that itself
            # contains commas is kept whole rather than cut at its first comma.
            # NOTE(review): assumes role is the last column of the file, as
            # the three-field check implies - confirm against the file header.
            parts = line.split(',', 2)
            if len(parts) < 3:
                continue
            person_id = int(parts[0])
            movie_id = int(parts[1])
            role = parts[2]
            movie = movies.get(movie_id)
            person = persons.get(person_id)
            if movie is None or person is None:
                continue
            movie['cast'].append({
                'person_id': person_id,
                'name': person['name'],
                'gender': person['gender'],
                'role': role,
            })
            linked_count += 1
    print(f"Loaded {linked_count} cast entries")
def insert_movies_to_mongodb():
    """Replace the MongoDB ``movies`` collection with the in-memory movies.

    Drops the existing collection, inserts one document per entry of the
    module-level ``movies`` dict (fields ``movie_id``, ``movie_name``,
    ``year``, ``rank``, ``directors``, ``cast``) and then creates the
    ``idx_movie_name`` index on ``movie_name``.

    ``connect()`` must have been called first so that ``movies_collection``
    is bound.
    """
    global movies, movies_collection
    movies_collection.drop()
    # Build fresh top-level dicts: the driver adds an `_id` key to every
    # document it inserts, and that should not leak into `movies`.
    documents = [
        {
            'movie_id': movie_data['movie_id'],
            'movie_name': movie_data['movie_name'],
            'year': movie_data['year'],
            'rank': movie_data['rank'],
            'directors': movie_data['directors'],
            'cast': movie_data['cast'],
        }
        for movie_data in movies.values()
    ]
    inserted_count = 0
    if documents:
        # One bulk call instead of a server round trip per movie.
        # insert_many rejects an empty list, hence the guard.
        result = movies_collection.insert_many(documents)
        inserted_count = len(result.inserted_ids)
    # Create indexes
    movies_collection.create_index("movie_name", name="idx_movie_name")
    print(f"Inserted {inserted_count} movies into MongoDB")
def main():
    """Load the IMDB text files, store them in MongoDB and run the sample query.

    Steps:
      1. Load movies, persons and directors into memory, then link directors
         and cast onto the movies (the link loaders need the first three).
      2. Print summary counts.
      3. Connect to MongoDB, rebuild the ``movies`` collection and look up
         the movie named "Shrek (2001)".

    The MongoDB client is closed even when the insert or the query fails.
    """
    # Load in the files
    movie_file = os.path.join("IMDB", "IMDBMovie.txt")
    person_file = os.path.join("IMDB", "IMDBPerson.txt")
    director_file = os.path.join("IMDB", "IMDBDirectors.txt")
    movie_director_file = os.path.join("IMDB", "IMDBMovie_Directors.txt")
    cast_file = os.path.join("IMDB", "IMDBCast.txt")
    # Load the data into memory
    load_movies_data(movie_file)
    load_persons_data(person_file)
    load_directors_data(director_file)
    load_movie_directors_data(movie_director_file)
    load_cast_data(cast_file)
    total_movies = len(movies)
    movies_with_directors = sum(1 for movie in movies.values() if movie['directors'])
    movies_with_cast = sum(1 for movie in movies.values() if movie['cast'])
    print(f"\nTotal movies loaded: {total_movies}")
    print(f"Movies with directors: {movies_with_directors}")
    print(f"Movies with cast: {movies_with_cast}")
    try:
        # Insert the data into MongoDB
        connect()
        insert_movies_to_mongodb()
        # b Query DB for "Shrek (2001)"
        result = movies_collection.find_one({'movie_name': "Shrek (2001)"})
        print('\nQuery Result for "Shrek (2001)":')
        print(result)
    finally:
        # Reading a module global needs no `global` statement. The None check
        # covers a connect() that failed before binding the client.
        if client is not None:
            client.close()
# Run the load only when executed as a script, so importing this module
# (e.g. to reuse a loader) does not read files or touch the database.
if __name__ == "__main__":
    main()