|
|
|
@@ -1,6 +1,7 @@
|
|
|
|
|
# HW3 - Task 1
|
|
|
|
|
# Nicholas Pease
|
|
|
|
|
# IMDB Data Loading into MongoDB
|
|
|
|
|
# I decided to try MongoDB because of its document structure, which aligns closely with the JSON format in which I store the majority of the data I work with in other projects.
|
|
|
|
|
# I stored the data in this assignment in a collection of movie documents, where each document contains information about a movie, its directors, and its cast.
|
|
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
import pandas as pd
|
|
|
|
@@ -10,7 +11,6 @@ import os
|
|
|
|
|
from typing import Dict, List, Optional
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
|
|
|
|
# Global data structures for processing
|
|
|
|
|
movies = {}
|
|
|
|
|
persons = {}
|
|
|
|
|
directors = {}
|
|
|
|
@@ -18,46 +18,25 @@ client = None
|
|
|
|
|
db = None
|
|
|
|
|
movies_collection = None
|
|
|
|
|
|
|
|
|
|
def connect(connection_string: str = "mongodb://localhost:27017/",
            database_name: str = "imdb_database") -> bool:
    """Connect to MongoDB and bind the module-level handles.

    On success the globals ``client``, ``db`` and ``movies_collection`` are
    set, where ``movies_collection`` is the ``movies`` collection of the
    selected database.  Calling ``connect()`` with no arguments targets the
    local server and the ``imdb_database`` database.

    Args:
        connection_string: MongoDB URI handed to ``MongoClient``.
        database_name: Name of the database to select on the client.

    Returns:
        True when the server answered the ping and the handles were bound;
        False when any exception was raised (the error is printed).
    """
    global client, db, movies_collection
    try:
        client = MongoClient(connection_string, serverSelectionTimeoutMS=5000)
        # Ping so an unreachable server is reported here rather than on the
        # first real query issued later.
        client.admin.command('ping')
        db = client[database_name]
        movies_collection = db.movies

        print(f"Connected to MongoDB database: {database_name}")
        return True

    except ConnectionFailure as e:
        print(f"Failed to connect to MongoDB: {e}")
        return False
    except Exception as e:
        print(f"MongoDB connection error: {e}")
        return False
|
|
|
|
|
|
|
|
|
|
def disconnect():
    """Close the module-level MongoDB client, if any, and clear the handles.

    After this call ``client``, ``db`` and ``movies_collection`` are all
    ``None``, so calling it again is a no-op and later code can detect the
    disconnected state with an ``is None`` check.
    """
    global client, db, movies_collection
    # Compare with None explicitly instead of relying on the truthiness of
    # the client object.
    if client is not None:
        client.close()
    # Drop the references so handles tied to the closed client are not reused.
    client = None
    db = None
    movies_collection = None
|
|
|
|
|
|
|
|
|
|
def load_movies_data(file_path: str, sample_size: Optional[int] = None) -> bool:
|
|
|
|
|
def load_movies_data(file_path):
|
|
|
|
|
global movies
|
|
|
|
|
try:
|
|
|
|
|
movies_loaded = 0
|
|
|
|
|
with open(file_path, 'r', encoding='latin-1') as file:
|
|
|
|
|
next(file)
|
|
|
|
|
next(file) # skip header
|
|
|
|
|
|
|
|
|
|
for line in file:
|
|
|
|
|
if sample_size and movies_loaded >= sample_size:
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
line = line.strip()
|
|
|
|
|
if not line:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
parts = line.split(',')
|
|
|
|
|
|
|
|
|
|
movie_id = int(parts[0])
|
|
|
|
@@ -81,29 +60,11 @@ def load_movies_data(file_path: str, sample_size: Optional[int] = None) -> bool:
|
|
|
|
|
|
|
|
|
|
movies_loaded += 1
|
|
|
|
|
|
|
|
|
|
except (ValueError, IndexError):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
print(f"Loaded {len(movies)} movies")
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"Error loading movies: {e}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def load_persons_data(file_path: str) -> bool:
|
|
|
|
|
def load_persons_data(file_path):
|
|
|
|
|
global persons
|
|
|
|
|
try:
|
|
|
|
|
persons_df = pd.read_csv(
|
|
|
|
|
file_path,
|
|
|
|
|
encoding='latin-1',
|
|
|
|
|
dtype={
|
|
|
|
|
'id': 'int32',
|
|
|
|
|
'fname': 'string',
|
|
|
|
|
'lname': 'string',
|
|
|
|
|
'gender': 'string'
|
|
|
|
|
}
|
|
|
|
|
)
|
|
|
|
|
persons_df = pd.read_csv(file_path,encoding='latin-1',dtype={'id': 'int32','fname': 'string','lname': 'string', 'gender': 'string'})
|
|
|
|
|
|
|
|
|
|
for _, row in persons_df.iterrows():
|
|
|
|
|
person_id = int(row['id'])
|
|
|
|
@@ -115,24 +76,10 @@ def load_persons_data(file_path: str) -> bool:
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
print(f"Loaded {len(persons)} persons")
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"Error loading persons: {e}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def load_directors_data(file_path: str) -> bool:
|
|
|
|
|
def load_directors_data(file_path):
|
|
|
|
|
global directors
|
|
|
|
|
try:
|
|
|
|
|
directors_df = pd.read_csv(
|
|
|
|
|
file_path,
|
|
|
|
|
encoding="latin-1",
|
|
|
|
|
dtype={
|
|
|
|
|
'id': 'int32',
|
|
|
|
|
'fname': 'string',
|
|
|
|
|
'lname': 'string'
|
|
|
|
|
}
|
|
|
|
|
)
|
|
|
|
|
directors_df = pd.read_csv(file_path,encoding="latin-1",dtype={'id': 'int32','fname': 'string','lname': 'string'})
|
|
|
|
|
|
|
|
|
|
for _, row in directors_df.iterrows():
|
|
|
|
|
director_id = int(row['id'])
|
|
|
|
@@ -143,20 +90,10 @@ def load_directors_data(file_path: str) -> bool:
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
print(f"Loaded {len(directors)} directors")
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"Error loading directors: {e}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def load_movie_directors_data(file_path: str) -> bool:
|
|
|
|
|
def load_movie_directors_data(file_path):
|
|
|
|
|
global movies, directors
|
|
|
|
|
try:
|
|
|
|
|
movie_directors_df = pd.read_csv(
|
|
|
|
|
file_path,
|
|
|
|
|
encoding='latin-1',
|
|
|
|
|
dtype={'did': 'int32', 'mid': 'int32'}
|
|
|
|
|
)
|
|
|
|
|
movie_directors_df = pd.read_csv(file_path,encoding='latin-1',dtype={'did': 'int32', 'mid': 'int32'})
|
|
|
|
|
|
|
|
|
|
linked_count = 0
|
|
|
|
|
for _, row in movie_directors_df.iterrows():
|
|
|
|
@@ -171,45 +108,28 @@ def load_movie_directors_data(file_path: str) -> bool:
|
|
|
|
|
movies[movie_id]['directors'].append(director_info)
|
|
|
|
|
linked_count += 1
|
|
|
|
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"Error loading movie-director relationships: {e}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def load_cast_data(file_path: str, max_cast_per_movie: int = 25) -> bool:
|
|
|
|
|
def load_cast_data(file_path):
|
|
|
|
|
global movies, persons
|
|
|
|
|
try:
|
|
|
|
|
if not os.path.exists(file_path):
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
movie_cast_count = defaultdict(int)
|
|
|
|
|
linked_count = 0
|
|
|
|
|
|
|
|
|
|
with open(file_path, 'r', encoding='latin-1') as file:
|
|
|
|
|
next(file)
|
|
|
|
|
|
|
|
|
|
for line_num, line in enumerate(file):
|
|
|
|
|
if line_num > 100000:
|
|
|
|
|
break
|
|
|
|
|
next(file) # skip header
|
|
|
|
|
|
|
|
|
|
for line in file:
|
|
|
|
|
line = line.strip()
|
|
|
|
|
if not line:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
parts = line.split(',')
|
|
|
|
|
|
|
|
|
|
if len(parts) < 3:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
person_id = int(parts[0])
|
|
|
|
|
movie_id = int(parts[1])
|
|
|
|
|
role = ','.join(parts[2:]) if len(parts) > 2 else "[Unknown]"
|
|
|
|
|
|
|
|
|
|
if (movie_id in movies and person_id in persons and
|
|
|
|
|
movie_cast_count[movie_id] < max_cast_per_movie):
|
|
|
|
|
role = parts[2]
|
|
|
|
|
|
|
|
|
|
if (movie_id in movies and person_id in persons):
|
|
|
|
|
cast_info = {
|
|
|
|
|
'person_id': person_id,
|
|
|
|
|
'name': persons[person_id]['name'],
|
|
|
|
@@ -217,37 +137,16 @@ def load_cast_data(file_path: str, max_cast_per_movie: int = 25) -> bool:
|
|
|
|
|
'role': role
|
|
|
|
|
}
|
|
|
|
|
movies[movie_id]['cast'].append(cast_info)
|
|
|
|
|
movie_cast_count[movie_id] += 1
|
|
|
|
|
linked_count += 1
|
|
|
|
|
|
|
|
|
|
except (ValueError, KeyError):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
if linked_count > 0:
|
|
|
|
|
print(f"Loaded {linked_count} cast entries")
|
|
|
|
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
def create_indexes():
    """Create the lookup indexes on the module-level ``movies_collection``.

    Four indexes are requested: a unique index on ``movie_name`` and plain
    indexes on ``year``, ``rank`` and ``directors.name``.  Each index is
    created in its own ``try`` so that one failure does not prevent the
    remaining indexes from being built; failures are printed, never raised.
    """
    if movies_collection is None:
        print("Error creating indexes: not connected to MongoDB")
        return

    index_specs = (
        ("movie_name", {"unique": True, "name": "idx_movie_name"}),
        ("year", {"name": "idx_year"}),
        ("rank", {"name": "idx_rank"}),
        ("directors.name", {"name": "idx_director_name"}),
    )
    for field, options in index_specs:
        try:
            movies_collection.create_index(field, **options)
        except Exception as e:
            # Best effort: report the problem and continue with the next index.
            print(f"Error creating index on {field}: {e}")
|
|
|
|
|
|
|
|
|
|
def insert_movies_to_mongodb(batch_size: int = 2000) -> bool:
|
|
|
|
|
def insert_movies_to_mongodb():
|
|
|
|
|
global movies, movies_collection
|
|
|
|
|
try:
|
|
|
|
|
movies_collection.drop()
|
|
|
|
|
|
|
|
|
|
documents = []
|
|
|
|
|
inserted_count = 0
|
|
|
|
|
|
|
|
|
|
for movie_id, movie_data in movies.items():
|
|
|
|
|
document = {
|
|
|
|
|
'movie_id': movie_data['movie_id'],
|
|
|
|
@@ -257,78 +156,30 @@ def insert_movies_to_mongodb(batch_size: int = 2000) -> bool:
|
|
|
|
|
'directors': movie_data['directors'],
|
|
|
|
|
'cast': movie_data['cast']
|
|
|
|
|
}
|
|
|
|
|
documents.append(document)
|
|
|
|
|
|
|
|
|
|
if len(documents) >= batch_size:
|
|
|
|
|
movies_collection.insert_many(documents, ordered=False)
|
|
|
|
|
documents = []
|
|
|
|
|
movies_collection.insert_one(document)
|
|
|
|
|
inserted_count += 1
|
|
|
|
|
|
|
|
|
|
if documents:
|
|
|
|
|
movies_collection.insert_many(documents, ordered=False)
|
|
|
|
|
|
|
|
|
|
create_indexes()
|
|
|
|
|
# Create indexes
|
|
|
|
|
movies_collection.create_index("movie_name", name="idx_movie_name")
|
|
|
|
|
|
|
|
|
|
total_count = movies_collection.count_documents({})
|
|
|
|
|
print(f"Inserted {total_count} movies into MongoDB")
|
|
|
|
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"Error inserting data into MongoDB: {e}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def query_movie_by_name(movie_name: str) -> Optional[Dict]:
    """Look up a single movie document by its exact ``movie_name`` value.

    Args:
        movie_name: Value matched against the ``movie_name`` field.

    Returns:
        Whatever ``movies_collection.find_one`` returns for the filter, or
        ``None`` if the lookup raised (the error is printed).
    """
    name_filter = {"movie_name": movie_name}
    try:
        return movies_collection.find_one(name_filter)
    except Exception as err:
        print(f"Error querying movie by name: {err}")
    return None
|
|
|
|
|
|
|
|
|
|
def query_movies_by_year_range(start_year: int, end_year: int) -> List[Dict]:
    """Return the movies whose ``year`` lies in ``[start_year, end_year]``.

    Both bounds are inclusive (``$gte`` / ``$lte``) and the results are
    sorted by ``year`` in ascending order.

    Args:
        start_year: Lower bound of the range.
        end_year: Upper bound of the range.

    Returns:
        A list of the matching documents, or an empty list if the query
        raised (the error is printed).
    """
    year_filter = {"year": {"$gte": start_year, "$lte": end_year}}
    try:
        cursor = movies_collection.find(year_filter)
        # Materialize inside the try so errors raised while iterating the
        # cursor are handled here as well.
        return list(cursor.sort("year", 1))
    except Exception as err:
        print(f"Error querying movies by year range: {err}")
        return []
|
|
|
|
|
print(f"Inserted {inserted_count} movies into MongoDB")
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
|
try:
|
|
|
|
|
print("IMDB DATA LOADING FOR MONGODB")
|
|
|
|
|
print("=" * 50)
|
|
|
|
|
# Load in the files
|
|
|
|
|
movie_file = os.path.join("IMDB", "IMDBMovie.txt")
|
|
|
|
|
person_file = os.path.join("IMDB", "IMDBPerson.txt")
|
|
|
|
|
director_file = os.path.join("IMDB", "IMDBDirectors.txt")
|
|
|
|
|
movie_director_file = os.path.join("IMDB", "IMDBMovie_Directors.txt")
|
|
|
|
|
cast_file = os.path.join("IMDB", "IMDBCast.txt")
|
|
|
|
|
|
|
|
|
|
data_directory = "IMDB"
|
|
|
|
|
sample_size = None
|
|
|
|
|
|
|
|
|
|
movie_file = os.path.join(data_directory, "IMDBMovie.txt")
|
|
|
|
|
person_file = os.path.join(data_directory, "IMDBPerson.txt")
|
|
|
|
|
director_file = os.path.join(data_directory, "IMDBDirectors.txt")
|
|
|
|
|
movie_director_file = os.path.join(data_directory, "IMDBMovie_Directors.txt")
|
|
|
|
|
cast_file = os.path.join(data_directory, "IMDBCast.txt")
|
|
|
|
|
|
|
|
|
|
if not load_movies_data(movie_file, sample_size=sample_size):
|
|
|
|
|
print("Failed to load movies")
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
if not load_persons_data(person_file):
|
|
|
|
|
print("Failed to load persons")
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
if not load_directors_data(director_file):
|
|
|
|
|
print("Failed to load directors")
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
if not load_movie_directors_data(movie_director_file):
|
|
|
|
|
print("Failed to load movie-director relationships")
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
if not load_cast_data(cast_file):
|
|
|
|
|
print("Failed to load cast data")
|
|
|
|
|
return
|
|
|
|
|
# Load the data into memory
|
|
|
|
|
load_movies_data(movie_file)
|
|
|
|
|
load_persons_data(person_file)
|
|
|
|
|
load_directors_data(director_file)
|
|
|
|
|
load_movie_directors_data(movie_director_file)
|
|
|
|
|
load_cast_data(cast_file)
|
|
|
|
|
|
|
|
|
|
total_movies = len(movies)
|
|
|
|
|
movies_with_directors = sum(1 for movie in movies.values() if movie['directors'])
|
|
|
|
@@ -338,31 +189,16 @@ def main():
|
|
|
|
|
print(f"Movies with directors: {movies_with_directors}")
|
|
|
|
|
print(f"Movies with cast: {movies_with_cast}")
|
|
|
|
|
|
|
|
|
|
if connect():
|
|
|
|
|
if insert_movies_to_mongodb(batch_size=1000):
|
|
|
|
|
print("Data inserted into MongoDB successfully")
|
|
|
|
|
# Insert the data into MongoDB
|
|
|
|
|
connect()
|
|
|
|
|
insert_movies_to_mongodb()
|
|
|
|
|
|
|
|
|
|
test_movie = query_movie_by_name("$ (1971)")
|
|
|
|
|
if test_movie:
|
|
|
|
|
print(f"\nFound movie: {test_movie['movie_name']} ({test_movie.get('year', 'N/A')})")
|
|
|
|
|
# b Query DB for "Shrek (2001)"
|
|
|
|
|
result = movies_collection.find_one({'movie_name': "Shrek (2001)"})
|
|
|
|
|
print('\nQuery Result for "Shrek (2001)":')
|
|
|
|
|
print(result)
|
|
|
|
|
|
|
|
|
|
total_in_db = movies_collection.count_documents({})
|
|
|
|
|
movies_with_directors_db = movies_collection.count_documents({"directors": {"$ne": []}})
|
|
|
|
|
global client
|
|
|
|
|
client.close()
|
|
|
|
|
|
|
|
|
|
print(f"Total movies in database: {total_in_db}")
|
|
|
|
|
print(f"Movies with directors: {movies_with_directors_db}")
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
print("Failed to insert data into MongoDB")
|
|
|
|
|
else:
|
|
|
|
|
print("MongoDB not available")
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"Error in main execution: {e}")
|
|
|
|
|
|
|
|
|
|
finally:
|
|
|
|
|
disconnect()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the loader exactly once, and only when executed as a script.
if __name__ == "__main__":
    main()
|