205 lines
6.9 KiB
Python
205 lines
6.9 KiB
Python
# HW3 - Task 1
|
|
# Nicholas Pease
|
|
# I decided to try MongoDB because of its document structure, which aligns closely with the JSON format in which I store the majority of the data I work with in other projects.
|
|
# I stored the data in this assignment in a collection of movie documents, where each document contains information about a movie, its directors, and its cast.
|
|
# Answer to part c: If we were accessing movies primarily by their years, followed by their names, I would create a compound index on the 'year' and 'movie_name' fields in the MongoDB collection.
|
|
|
|
import csv
|
|
import pandas as pd
|
|
from pymongo import MongoClient, ASCENDING
|
|
from pymongo.errors import ConnectionFailure, DuplicateKeyError
|
|
import os
|
|
from typing import Dict, List, Optional
|
|
from collections import defaultdict
|
|
|
|
# In-memory staging tables filled by the load_* functions.
# Each is keyed by the integer id of the record it holds.
movies: Dict[int, dict] = {}     # movie_id -> movie document (with 'directors' and 'cast' lists)
persons: Dict[int, dict] = {}    # person_id -> {'person_id', 'name', 'gender'}
directors: Dict[int, dict] = {}  # director_id -> {'director_id', 'name'}

# MongoDB handles; they stay None until connect() assigns them.
client = None
db = None
movies_collection = None
|
|
|
|
def connect(uri="mongodb://localhost:27017/", db_name="imdb_database", timeout_ms=5000):
    """Open a MongoDB connection and publish it through the module globals.

    On success the globals ``client``, ``db`` and ``movies_collection`` are
    set; ``movies_collection`` is the ``movies`` collection of ``db_name``.

    Args:
        uri: MongoDB connection string.
        db_name: Name of the database to use.
        timeout_ms: Server selection timeout passed to ``MongoClient``.

    Raises:
        Whatever the ``ping`` command raises when the server cannot be
        reached; in that case the globals are left untouched.
    """
    global client, db, movies_collection

    # Build the client locally first so a failed ping never leaves a dead,
    # unclosed client behind in the module globals.
    candidate = MongoClient(uri, serverSelectionTimeoutMS=timeout_ms)
    try:
        # MongoClient connects lazily; the ping forces a real round trip.
        candidate.admin.command('ping')
    except Exception:
        candidate.close()
        raise

    client = candidate
    db = client[db_name]
    movies_collection = db.movies

    print(f"Connected to MongoDB database: {db_name}")
|
|
|
|
def load_movies_data(file_path):
    """Parse the movie file into the global ``movies`` dict.

    The file is read as latin-1 text with one header line followed by rows
    of the form ``id,name,year,rank``. The name itself may contain commas,
    so for rows with four or more fields the last two fields are taken as
    year and rank and everything between the id and them is the name.
    Empty year/rank fields are stored as ``None``.

    Args:
        file_path: Path to the movie file.

    Raises:
        ValueError: If an id, year or rank field is not numeric.
    """
    global movies

    with open(file_path, 'r', encoding='latin-1') as file:
        next(file, None)  # skip header; the default avoids StopIteration on an empty file

        for line in file:
            line = line.strip()
            if not line:
                # Blank lines would otherwise crash on int('').
                continue

            parts = line.split(',')
            movie_id = int(parts[0])

            if len(parts) >= 4:
                # Re-join any commas that belong to the title.
                movie_name = ','.join(parts[1:-2])
                year_str = parts[-2]
                rank_str = parts[-1]
            else:
                # Short row: read fields positionally so the name is never
                # mistaken for the year.
                movie_name = parts[1] if len(parts) > 1 else ''
                year_str = parts[2] if len(parts) > 2 else ''
                rank_str = ''

            year_str = year_str.strip()
            rank_str = rank_str.strip()

            movies[movie_id] = {
                'movie_id': movie_id,
                'movie_name': movie_name,
                'year': int(year_str) if year_str else None,
                'rank': float(rank_str) if rank_str else None,
                'directors': [],
                'cast': [],
            }

    print(f"Loaded {len(movies)} movies")
|
|
|
|
def load_persons_data(file_path):
    """Load the person file into the global ``persons`` dict.

    The file is read with pandas as latin-1 CSV and must provide the columns
    ``id``, ``fname``, ``lname`` and ``gender``. Each person is stored under
    its integer id as ``{'person_id', 'name', 'gender'}``, where ``name`` is
    "fname lname" with surrounding whitespace removed and ``gender`` is
    ``None`` when the field is empty.

    Args:
        file_path: Path to the person file.
    """
    global persons

    persons_df = pd.read_csv(
        file_path,
        encoding='latin-1',
        dtype={'id': 'int32', 'fname': 'string', 'lname': 'string', 'gender': 'string'},
        # Without this pandas parses real names such as "Nan" or "Null" as
        # missing values.
        keep_default_na=False,
    )

    # Any value that is still missing becomes '' so it never shows up as the
    # literal text "<NA>" in a stored name.
    first_names = persons_df['fname'].fillna('')
    last_names = persons_df['lname'].fillna('')
    genders = persons_df['gender'].fillna('')

    # zip over the columns avoids building a Series per row (iterrows).
    for raw_id, fname, lname, gender in zip(persons_df['id'], first_names, last_names, genders):
        person_id = int(raw_id)
        persons[person_id] = {
            'person_id': person_id,
            'name': f"{fname} {lname}".strip(),
            'gender': str(gender) if gender else None,
        }

    print(f"Loaded {len(persons)} persons")
|
|
|
|
def load_directors_data(file_path):
    """Load the director file into the global ``directors`` dict.

    The file is read with pandas as latin-1 CSV and must provide the columns
    ``id``, ``fname`` and ``lname``. Each director is stored under its
    integer id as ``{'director_id', 'name'}``, where ``name`` is
    "fname lname" with surrounding whitespace removed.

    Args:
        file_path: Path to the director file.
    """
    global directors

    directors_df = pd.read_csv(
        file_path,
        encoding="latin-1",
        dtype={'id': 'int32', 'fname': 'string', 'lname': 'string'},
        # Without this pandas parses real names such as "Nan" or "Null" as
        # missing values.
        keep_default_na=False,
    )

    # Any value that is still missing becomes '' so it never shows up as the
    # literal text "<NA>" in a stored name.
    first_names = directors_df['fname'].fillna('')
    last_names = directors_df['lname'].fillna('')

    # zip over the columns avoids building a Series per row (iterrows).
    for raw_id, fname, lname in zip(directors_df['id'], first_names, last_names):
        director_id = int(raw_id)
        directors[director_id] = {
            'director_id': director_id,
            'name': f"{fname} {lname}".strip(),
        }

    print(f"Loaded {len(directors)} directors")
|
|
|
|
def load_movie_directors_data(file_path):
    """Attach directors to movies using the movie/director link file.

    The file is read with pandas as latin-1 CSV and must provide the integer
    columns ``did`` (director id) and ``mid`` (movie id). For every row whose
    movie is in the global ``movies`` and whose director is in the global
    ``directors``, a ``{'director_id', 'name'}`` entry is appended to that
    movie's ``'directors'`` list. Rows referencing unknown ids are ignored.

    Args:
        file_path: Path to the movie/director link file.
    """
    global movies, directors

    movie_directors_df = pd.read_csv(
        file_path,
        encoding='latin-1',
        dtype={'did': 'int32', 'mid': 'int32'},
    )

    linked_count = 0
    # zip over the two columns avoids building a Series per row (iterrows).
    for raw_did, raw_mid in zip(movie_directors_df['did'], movie_directors_df['mid']):
        director_id = int(raw_did)
        movie_id = int(raw_mid)

        if movie_id not in movies or director_id not in directors:
            continue

        movies[movie_id]['directors'].append({
            'director_id': director_id,
            'name': directors[director_id]['name'],
        })
        linked_count += 1

    # Report the count, mirroring what load_cast_data does for cast entries.
    print(f"Loaded {linked_count} director links")
|
|
|
|
def load_cast_data(file_path):
    """Attach cast members to movies using the cast file.

    The file is read as latin-1 text with one header line followed by rows
    of the form ``person_id,movie_id,role``. For every row whose movie is in
    the global ``movies`` and whose person is in the global ``persons``, a
    ``{'person_id', 'name', 'gender', 'role'}`` entry is appended to that
    movie's ``'cast'`` list. Blank lines, rows with fewer than three fields
    and rows referencing unknown ids are skipped.

    Args:
        file_path: Path to the cast file.

    Raises:
        ValueError: If a person or movie id field is not an integer.
    """
    global movies, persons

    linked_count = 0

    with open(file_path, 'r', encoding='latin-1') as file:
        next(file, None)  # skip header; the default avoids StopIteration on an empty file

        for line in file:
            line = line.strip()
            if not line:
                continue

            # Split only on the first two commas: the role is free text and
            # may itself contain commas, which must not be truncated.
            parts = line.split(',', 2)
            if len(parts) < 3:
                continue

            person_id = int(parts[0])
            movie_id = int(parts[1])
            role = parts[2]

            if movie_id not in movies or person_id not in persons:
                continue

            person = persons[person_id]
            movies[movie_id]['cast'].append({
                'person_id': person_id,
                'name': person['name'],
                'gender': person['gender'],
                'role': role,
            })
            linked_count += 1

    print(f"Loaded {linked_count} cast entries")
|
|
|
|
def insert_movies_to_mongodb(batch_size=1000):
    """Replace the MongoDB movies collection with the in-memory movies.

    Drops ``movies_collection``, inserts one document per entry of the
    global ``movies`` dict, then creates an index on ``movie_name``.
    ``connect()`` must have been called first so that ``movies_collection``
    is set.

    Args:
        batch_size: Maximum number of documents sent per ``insert_many``
            call. Batching avoids one network round trip per movie.
    """
    global movies, movies_collection

    movies_collection.drop()

    inserted_count = 0
    batch = []

    for movie_data in movies.values():
        # Build a fresh top-level dict: the driver adds an '_id' key to each
        # inserted document, which must not leak into the in-memory table.
        batch.append({
            'movie_id': movie_data['movie_id'],
            'movie_name': movie_data['movie_name'],
            'year': movie_data['year'],
            'rank': movie_data['rank'],
            'directors': movie_data['directors'],
            'cast': movie_data['cast'],
        })

        if len(batch) >= batch_size:
            movies_collection.insert_many(batch)
            inserted_count += len(batch)
            batch = []

    # insert_many rejects an empty list, so only flush a non-empty remainder.
    if batch:
        movies_collection.insert_many(batch)
        inserted_count += len(batch)

    # Create indexes
    movies_collection.create_index("movie_name", name="idx_movie_name")

    print(f"Inserted {inserted_count} movies into MongoDB")
|
|
|
|
def main():
    """Run the full pipeline: load the IMDB files, store them, query one movie.

    Loads the five data files from the ``IMDB`` directory into memory,
    prints summary counts, connects to MongoDB, inserts the movie documents,
    and prints the document whose ``movie_name`` is "Shrek (2001)". The
    MongoDB client is closed even if the insert or the query fails.
    """
    # Load in the files
    movie_file = os.path.join("IMDB", "IMDBMovie.txt")
    person_file = os.path.join("IMDB", "IMDBPerson.txt")
    director_file = os.path.join("IMDB", "IMDBDirectors.txt")
    movie_director_file = os.path.join("IMDB", "IMDBMovie_Directors.txt")
    cast_file = os.path.join("IMDB", "IMDBCast.txt")

    # Load the data into memory. Movies, persons and directors must be
    # loaded before the link files, which only attach to known ids.
    load_movies_data(movie_file)
    load_persons_data(person_file)
    load_directors_data(director_file)
    load_movie_directors_data(movie_director_file)
    load_cast_data(cast_file)

    total_movies = len(movies)
    movies_with_directors = sum(1 for movie in movies.values() if movie['directors'])
    movies_with_cast = sum(1 for movie in movies.values() if movie['cast'])

    print(f"\nTotal movies loaded: {total_movies}")
    print(f"Movies with directors: {movies_with_directors}")
    print(f"Movies with cast: {movies_with_cast}")

    # Insert the data into MongoDB
    connect()
    try:
        insert_movies_to_mongodb()

        # b Query DB for "Shrek (2001)"
        result = movies_collection.find_one({'movie_name': "Shrek (2001)"})
        print('\nQuery Result for "Shrek (2001)":')
        print(result)
    finally:
        # Release the connection even when the insert or query raised.
        client.close()
|
|
|
|
# Run the pipeline only when executed as a script, so importing this module
# does not trigger file loading and database writes.
if __name__ == "__main__":
    main()